Library imports

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
Mounted at /content/drive/

Problem Statement

The aim of this competition is predict a Book's Price. The model could be used by a Publishing house to determine a price of the new book they plan to launch.

In [2]:
!pip install cloud-tpu-client
!gdown --id 1wg8LXo5UuFWMMP-x5xatcgdwVTKUr6nM # Download the glove embedding file
Collecting cloud-tpu-client
  Downloading https://files.pythonhosted.org/packages/56/9f/7b1958c2886db06feb5de5b2c191096f9e619914b6c31fdf93999fdbbd8b/cloud_tpu_client-0.10-py3-none-any.whl
Requirement already satisfied: oauth2client in /usr/local/lib/python3.7/dist-packages (from cloud-tpu-client) (4.1.3)
Collecting google-api-python-client==1.8.0
  Downloading https://files.pythonhosted.org/packages/9a/b4/a955f393b838bc47cbb6ae4643b9d0f90333d3b4db4dc1e819f36aad18cc/google_api_python_client-1.8.0-py3-none-any.whl (57kB)
     |████████████████████████████████| 61kB 4.0MB/s 
Requirement already satisfied: six>=1.6.1 in /usr/local/lib/python3.7/dist-packages (from oauth2client->cloud-tpu-client) (1.15.0)
Requirement already satisfied: httplib2>=0.9.1 in /usr/local/lib/python3.7/dist-packages (from oauth2client->cloud-tpu-client) (0.17.4)
Requirement already satisfied: rsa>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from oauth2client->cloud-tpu-client) (4.7.2)
Requirement already satisfied: pyasn1>=0.1.7 in /usr/local/lib/python3.7/dist-packages (from oauth2client->cloud-tpu-client) (0.4.8)
Requirement already satisfied: pyasn1-modules>=0.0.5 in /usr/local/lib/python3.7/dist-packages (from oauth2client->cloud-tpu-client) (0.2.8)
Requirement already satisfied: google-auth>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client==1.8.0->cloud-tpu-client) (1.31.0)
Requirement already satisfied: google-auth-httplib2>=0.0.3 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client==1.8.0->cloud-tpu-client) (0.0.4)
Requirement already satisfied: uritemplate<4dev,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client==1.8.0->cloud-tpu-client) (3.0.1)
Requirement already satisfied: google-api-core<2dev,>=1.13.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client==1.8.0->cloud-tpu-client) (1.26.3)
Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth>=1.4.1->google-api-python-client==1.8.0->cloud-tpu-client) (4.2.2)
Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.7/dist-packages (from google-auth>=1.4.1->google-api-python-client==1.8.0->cloud-tpu-client) (57.0.0)
Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (1.53.0)
Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (20.9)
Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (2.23.0)
Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (2018.9)
Requirement already satisfied: protobuf>=3.12.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (3.12.4)
Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=14.3->google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (2.4.7)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (2.10)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (1.24.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (2021.5.30)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.13.0->google-api-python-client==1.8.0->cloud-tpu-client) (3.0.4)
ERROR: earthengine-api 0.1.269 has requirement google-api-python-client<2,>=1.12.1, but you'll have google-api-python-client 1.8.0 which is incompatible.
Installing collected packages: google-api-python-client, cloud-tpu-client
  Found existing installation: google-api-python-client 1.12.8
    Uninstalling google-api-python-client-1.12.8:
      Successfully uninstalled google-api-python-client-1.12.8
Successfully installed cloud-tpu-client-0.10 google-api-python-client-1.8.0
Downloading...
From: https://drive.google.com/uc?id=1wg8LXo5UuFWMMP-x5xatcgdwVTKUr6nM
To: /content/glove.6B.100d.txt
347MB [00:03, 90.7MB/s]
In [3]:
import os
import pandas as pd
import logging
import numpy as np
import re
import pickle

from importlib.machinery import SourceFileLoader

import plotly.express as px
import plotly.graph_objects as go
import seaborn as sn
import plotly.io as pio

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import (mean_squared_error,\
                             mean_absolute_error)

from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
In [4]:
logging.basicConfig(level='DEBUG')


logger = logging.getLogger()


PROJECT_PATH = os.path.join(os.getcwd(),\
                            'drive',\
                            'MyDrive',\
                            'Datasets',\
                            'MachineHack',\
                            'PredictPriceBook')
logger.debug(f'directory has {os.listdir(PROJECT_PATH)}')

LIBRARY_PATH = os.path.join(os.getcwd(),\
                            'drive',\
                            'MyDrive',\
                            'mylib')
logger.debug(f'directory has {os.listdir(LIBRARY_PATH)}')


logger.debug(f"max col width setting is {pd.\
            get_option('display.max_colwidth')}")
pd.set_option('display.max_colwidth',100)

logger.debug(f"max columns setting is {pd.\
            get_option('display.max_columns')}")
pd.set_option('display.max_columns',100)

logger.debug(f"display.float fomat is {pd.\
            get_option('display.float_format')}")
pd.set_option('display.float_format', '{:.3f}'.format)

logger.debug(f'Default renderer is {pio.renderers}')
#Setting renderer
pio.renderers.default = 'notebook_connected'
logger.debug(f'New renderer is {pio.renderers}')
DEBUG:root:directory has ['Data_Test.xlsx', 'Data_Train.xlsx', 'Sample_Submission.xlsx', 'eval', 'model', 'Stacking.xlsx', 'LinearReg.xlsx', 'RandomForest.xlsx', 'SVM.xlsx', 'XGBoost.xlsx', 'RandomForestglove.xlsx', 'LinearRegglove.xlsx', 'SVMglove.xlsx', 'XGBoostglove.xlsx', 'LinearRegtfidf.xlsx', 'SVMtfidf.xlsx', 'XGBoosttfidf.xlsx', 'RandomForesttfidf.xlsx', 'Stackingtfidf.xlsx', 'LinearReggloveSynopsistfidfTitle.xlsx', 'SVMgloveSynopsistfidfTitle.xlsx', 'XGBoostgloveSynopsistfidfTitle.xlsx', 'RandomForestgloveSynopsistfidfTitle.xlsx', 'StackinggloveSynopsistfidfTitle.xlsx', 'LinearRegtfidfSynopsistfidfTitle.xlsx', 'RandomForesttfidfSynopsistfidfTitle.xlsx', 'SVMtfidfSynopsistfidfTitle.xlsx', 'XGBoosttfidfSynopsistfidfTitle.xlsx', 'StackingtfidfSynopsistfidfTitle.xlsx']
DEBUG:root:directory has ['__pycache__', 'eval.py', 'eda.py', 'feature_engineering.py', 'model_building.py']
DEBUG:root:max col width setting is 50
DEBUG:root:max columns setting is 0
DEBUG:root:display.float fomat is None
DEBUG:root:Default renderer is Renderers configuration
-----------------------
    Default renderer: 'colab'
    Available renderers:
        ['plotly_mimetype', 'jupyterlab', 'nteract', 'vscode',
         'notebook', 'notebook_connected', 'kaggle', 'azure', 'colab',
         'cocalc', 'databricks', 'json', 'png', 'jpeg', 'jpg', 'svg',
         'pdf', 'browser', 'firefox', 'chrome', 'chromium', 'iframe',
         'iframe_connected', 'sphinx_gallery']

DEBUG:root:New renderer is Renderers configuration
-----------------------
    Default renderer: 'notebook_connected'
    Available renderers:
        ['plotly_mimetype', 'jupyterlab', 'nteract', 'vscode',
         'notebook', 'notebook_connected', 'kaggle', 'azure', 'colab',
         'cocalc', 'databricks', 'json', 'png', 'jpeg', 'jpg', 'svg',
         'pdf', 'browser', 'firefox', 'chrome', 'chromium', 'iframe',
         'iframe_connected', 'sphinx_gallery']

In [5]:
eda = SourceFileLoader('eda',\
                        os.path.join(LIBRARY_PATH,\
                                     'eda.py')).load_module()

eval = SourceFileLoader('eval',\
                        os.path.join(LIBRARY_PATH,\
                                     'eval.py')).load_module()

mb = SourceFileLoader('model_building',\
                        os.path.join(LIBRARY_PATH,\
                                     'model_building.py')).load_module()

fe = SourceFileLoader('feature_engineering',\
                        os.path.join(LIBRARY_PATH,\
                                     'feature_engineering.py')).load_module()
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7

Error metric

In [6]:
el = eval.RegressionEvaluation(precision=3)
el.load_data(os.path.join(PROJECT_PATH,'eval','eval.csv'))
logger.debug(f'Here is the loaded metric file \n {el.get_metrics()}')
DEBUG:root:Here is the loaded metric file 
                                              model  r_squared  train_mse  \
0                                        XGBoostRF      0.224      0.086   
1                                         Stacking      0.666      0.037   
2                                        LinearReg      0.499      0.056   
3                                     DecisionTree      0.369      0.070   
4                                     RandomForest      0.693      0.034   
5                          RandomForestTopFeatures      0.701      0.033   
6                                              SVM      0.164      0.093   
7                                          XGBoost      0.676      0.036   
8                               XGBoostTopFeatures      0.693      0.034   
9                     RandomForestTopFeaturesglove      0.834      0.018   
10                               RandomForestglove      0.724      0.030   
11                         XGBoostTopFeaturesglove      0.774      0.025   
12                                  LinearRegglove      0.902      0.011   
13                               DecisionTreeglove      0.880      0.013   
14                                        SVMglove      0.946      0.006   
15                                    XGBoostglove      0.880      0.013   
16                                  LinearRegtfidf      0.647      0.039   
17                                 ElasticNettfidf      0.638      0.040   
18                               DecisionTreetfidf      0.602      0.044   
19                                        SVMtfidf      0.766      0.026   
20                                    XGBoosttfidf      0.784      0.024   
21                         XGBoostTopFeaturestfidf      0.781      0.024   
22                               RandomForesttfidf      0.858      0.016   
23                    RandomForestTopFeaturestfidf      0.859      0.015   
24                                   Stackingtfidf      0.813      0.020   
25                LinearReggloveSynopsistfidfTitle      0.633      0.040   
26               ElasticNetgloveSynopsistfidfTitle      0.621      0.041   
27             DecisionTreegloveSynopsistfidfTitle      0.548      0.049   
28                      SVMgloveSynopsistfidfTitle      0.751      0.027   
29                  XGBoostgloveSynopsistfidfTitle      0.832      0.018   
30       XGBoostTopFeaturesgloveSynopsistfidfTitle      0.832      0.018   
31             RandomForestgloveSynopsistfidfTitle      0.863      0.015   
32  RandomForestTopFeaturesgloveSynopsistfidfTitle      0.863      0.015   
33                 StackinggloveSynopsistfidfTitle      0.820      0.020   
34                LinearRegtfidfSynopsistfidfTitle      0.649      0.039   
35               ElasticNettfidfSynopsistfidfTitle      0.639      0.040   
36             DecisionTreetfidfSynopsistfidfTitle      0.601      0.044   
37             RandomForesttfidfSynopsistfidfTitle      0.855      0.016   
38  RandomForestTopFeaturestfidfSynopsistfidfTitle      0.857      0.016   
39                      SVMtfidfSynopsistfidfTitle      0.767      0.025   
40                  XGBoosttfidfSynopsistfidfTitle      0.782      0.024   
41       XGBoostTopFeaturestfidfSynopsistfidfTitle      0.803      0.022   
42                 StackingtfidfSynopsistfidfTitle      0.815      0.020   

    train_mae  train_lmae                   val_mse       val_mae  val_lmae  
0       0.220       1.084                     0.088         0.220     1.086  
1       0.143       1.055                     0.057         0.177     1.067  
2       0.178       1.068                     0.066         0.192     1.073  
3       0.198       1.077                     0.086         0.221     1.084  
4       0.136       1.053                     0.064         0.186     1.071  
5       0.134       1.052                     0.063         0.186     1.071  
6       0.223       1.088                     0.107         0.243     1.093  
7       0.143       1.054                     0.060         0.181     1.069  
8       0.140       1.053                     0.059         0.180     1.069  
9       0.096       1.039                     0.066         0.185     1.072  
10      0.127       1.050                     0.069         0.194     1.075  
11      0.118       1.045                     0.067         0.194     1.074  
12      0.063       1.029                     0.013         0.068     1.033  
13      0.067       1.033                     0.015         0.071     1.035  
14      0.062       1.022                     0.014         0.080     1.034  
15      0.065       1.033                     0.014         0.067     1.034  
16      0.145       1.056 299642795180295913472.000 400081986.400     1.831  
17      0.146       1.056                     0.045         0.155     1.060  
18      0.149       1.059                     0.053         0.167     1.066  
19      0.114       1.045                     0.042         0.149     1.058  
20      0.110       1.043                     0.043         0.148     1.059  
21      0.111       1.044                     0.043         0.147     1.059  
22      0.078       1.035                     0.044         0.143     1.059  
23      0.078       1.035                     0.044         0.143     1.059  
24      0.098       1.040                     0.041         0.143     1.057  
25      0.148       1.057                     0.048         0.161     1.062  
26      0.149       1.057                     0.046         0.157     1.061  
27      0.160       1.063                     0.055         0.170     1.066  
28      0.118       1.046                     0.045         0.153     1.060  
29      0.098       1.038                     0.045         0.149     1.060  
30      0.098       1.038                     0.045         0.151     1.060  
31      0.076       1.034                     0.046         0.147     1.061  
32      0.077       1.034                     0.046         0.147     1.061  
33      0.099       1.040                     0.043         0.146     1.059  
34      0.145       1.055   6656443694119302144.000  59630466.965     1.752  
35      0.146       1.056                     0.045         0.155     1.060  
36      0.149       1.059                     0.053         0.165     1.065  
37      0.078       1.035                     0.044         0.143     1.059  
38      0.078       1.035                     0.044         0.143     1.059  
39      0.114       1.045                     0.042         0.149     1.058  
40      0.110       1.044                     0.042         0.147     1.058  
41      0.106       1.041                     0.044         0.150     1.060  
42      0.098       1.040                     0.041         0.144     1.057  

Data exploration

In [7]:
logger.debug(f'Files in directory are {os.listdir(PROJECT_PATH)}')
DEBUG:root:Files in directory are ['Data_Test.xlsx', 'Data_Train.xlsx', 'Sample_Submission.xlsx', 'eval', 'model', 'Stacking.xlsx', 'LinearReg.xlsx', 'RandomForest.xlsx', 'SVM.xlsx', 'XGBoost.xlsx', 'RandomForestglove.xlsx', 'LinearRegglove.xlsx', 'SVMglove.xlsx', 'XGBoostglove.xlsx', 'LinearRegtfidf.xlsx', 'SVMtfidf.xlsx', 'XGBoosttfidf.xlsx', 'RandomForesttfidf.xlsx', 'Stackingtfidf.xlsx', 'LinearReggloveSynopsistfidfTitle.xlsx', 'SVMgloveSynopsistfidfTitle.xlsx', 'XGBoostgloveSynopsistfidfTitle.xlsx', 'RandomForestgloveSynopsistfidfTitle.xlsx', 'StackinggloveSynopsistfidfTitle.xlsx', 'LinearRegtfidfSynopsistfidfTitle.xlsx', 'RandomForesttfidfSynopsistfidfTitle.xlsx', 'SVMtfidfSynopsistfidfTitle.xlsx', 'XGBoosttfidfSynopsistfidfTitle.xlsx', 'StackingtfidfSynopsistfidfTitle.xlsx']
In [8]:
dataset = pd.read_excel(os.path.join(PROJECT_PATH,'Data_Train.xlsx'))
test_dataset = pd.read_excel(os.path.join(PROJECT_PATH,'Data_Test.xlsx'))
logger.debug(f'Shape of Train Dataset is {dataset.shape}')
logger.debug(f'Shape of Test Dataset is {test_dataset.shape}')
DEBUG:root:Shape of Train Dataset is (6237, 9)
DEBUG:root:Shape of Test Dataset is (1560, 8)
In [9]:
val_dataset = dataset[dataset['Title'].\
        isin(test_dataset['Title'])]
val_index = val_dataset.index
logger.debug(f'val_dataset shape is {val_dataset.shape}')
DEBUG:root:val_dataset shape is (362, 9)
In [10]:
eda.explain_data(dataset)
INFO:numexpr.utils:NumExpr defaulting to 2 threads.
The data has 6237 rows and 9 columns
Below are the column wise data-types,missing values, unique level and descriptive stats of the data
Out[10]:
dtypes missing_values unique_values count unique top freq mean std min 25% 50% 75% max
Title object 0 5568 6237 5568 Casino Royale: James Bond 007 (Vintage) 4 NaN NaN NaN NaN NaN NaN NaN
Author object 0 3679 6237 3679 Agatha Christie 69 NaN NaN NaN NaN NaN NaN NaN
Edition object 0 3370 6237 3370 Paperback,– 5 Oct 2017 48 NaN NaN NaN NaN NaN NaN NaN
Reviews object 0 36 6237 36 5.0 out of 5 stars 1375 NaN NaN NaN NaN NaN NaN NaN
Ratings object 0 342 6237 342 1 customer review 1040 NaN NaN NaN NaN NaN NaN NaN
Synopsis object 0 5549 6237 5549 A Tinkle Double Digest is two Tinkle Digests in one volume. These include the best stories of Ti... 8 NaN NaN NaN NaN NaN NaN NaN
Genre object 0 345 6237 345 Action & Adventure (Books) 947 NaN NaN NaN NaN NaN NaN NaN
BookCategory object 0 11 6237 11 Action & Adventure 818 NaN NaN NaN NaN NaN NaN NaN
Price float64 0 1614 6237.000 NaN NaN NaN 560.708 690.111 25.000 249.180 373.000 599.000 14100.000

Exploring each feature and its behavior with Target

In [11]:
test_dataset.loc[:,'Price'] = -1

merge_dataset = pd.concat([dataset,test_dataset])
merge_dataset = merge_dataset.reset_index()

Price

Price requires a log transformation due to huge range in values

In [12]:
condition = (merge_dataset['Price'] != -1)
fig = px.histogram(merge_dataset[condition],
                   x='Price')

fig.show()

merge_dataset['LogPrice'] = merge_dataset['Price'].\
apply(lambda x: np.log10(x) if x != -1 else -1)

fig = px.histogram(merge_dataset[condition],
                   x='LogPrice')

fig.show()
In [13]:
# condition = ((merge_dataset['LogPrice'] > 1.5) |\
#              (merge_dataset['LogPrice'] == -1))
# merge_dataset = merge_dataset[condition].copy()
logger.debug(f'Merge_dataset shape is {merge_dataset.shape}')
DEBUG:root:Merge_dataset shape is (7797, 11)
In [14]:
log_price_bucket = pd.cut(merge_dataset['LogPrice'],\
                          bins=[-2,0,2.3522,2.51,2.645,2.8459,10],\
                          labels=['Target',\
                                  'Very Low',\
                                  'Low',\
                                  'Med',\
                                  'High',\
                                  'Very High'])
merge_dataset['LogPriceBucket'] = log_price_bucket
merge_dataset['LogPriceBucket'] = merge_dataset['LogPriceBucket'].astype('object')
logger.debug(f"Price buckets are \n {merge_dataset['LogPriceBucket'].\
                                     value_counts(normalize=True)}")
DEBUG:root:Price buckets are 
 Target      0.200
Low         0.163
Very High   0.162
Very Low    0.162
Med         0.161
High        0.152
Name: LogPriceBucket, dtype: float64

Title

Analyzing records where Title is repeating tells us that Book prices are varying as per category. There is also presence of certain cases where a books has varying price for different instances while all other fields are same.

In [15]:
#Analyzing top repeats
merge_dataset[merge_dataset['Title'].\
        isin(['Casino Royale: James Bond 007 (Vintage)'])].\
                               sort_values(by='Title')
Out[15]:
index Title Author Edition Reviews Ratings Synopsis Genre BookCategory Price LogPrice LogPriceBucket
2111 2111 Casino Royale: James Bond 007 (Vintage) Ian Fleming Paperback,– 2 Aug 2012 4.6 out of 5 stars 14 customer reviews The first 007 adventure by Ian Fleming - now with a new introduction by Anthony Horowitz, author... Action & Adventure (Books) Crime, Thriller & Mystery 359.700 2.556 Med
4202 4202 Casino Royale: James Bond 007 (Vintage) Ian Fleming Paperback,– 2 Aug 2012 4.6 out of 5 stars 14 customer reviews The first 007 adventure by Ian Fleming - now with a new introduction by Anthony Horowitz, author... Action & Adventure (Books) Romance 359.900 2.556 Med
4423 4423 Casino Royale: James Bond 007 (Vintage) Ian Fleming Paperback,– 2 Aug 2012 4.6 out of 5 stars 14 customer reviews The first 007 adventure by Ian Fleming - now with a new introduction by Anthony Horowitz, author... Action & Adventure (Books) Action & Adventure 335.000 2.525 Med
5162 5162 Casino Royale: James Bond 007 (Vintage) Ian Fleming Paperback,– 2 Aug 2012 4.6 out of 5 stars 14 customer reviews The first 007 adventure by Ian Fleming - now with a new introduction by Anthony Horowitz, author... Action & Adventure (Books) Action & Adventure 346.680 2.540 Med
In [16]:
multi_price_titles = eda.get_multi_value_keys(merge_dataset,'Title','Price')
logger.debug(f'Multi price titles are {multi_price_titles}')
DEBUG:root:Multi price titles are Title
The Elements of Style                                                      4
Casino Royale: James Bond 007 (Vintage)                                    4
Murder in a Minute                                                         3
The World of Ice and Fire (Song of Ice & Fire)                             3
Fall of Giants (The Century Trilogy)                                       3
                                                                          ..
Tarzan: The Complete Russ Manning Newspaper Strips Volume 4 (1974-1979)    2
Teenage Diaries - The Days That Were                                       2
Temple                                                                     2
Tender is the Night                                                        2
#GIRLBOSS                                                                  2
Name: Price, Length: 567, dtype: int64
There are 567 multivalue keys 

In [17]:
# # Averaging price for observation with all other same fields.
# ivs = list(dataset.columns.drop('Price'))
# logger.debug(f'Columns are {ivs}')
# dataset = dataset.groupby(ivs)['Price'].mean().reset_index()
# logger.\
# debug(f"Grouped merge data is \n {dataset[dataset['Title']== 'Casino Royale: James Bond 007 (Vintage)']}")
In [18]:
wc = eda.create_word_cloud(merge_dataset['Title'],\
                      stop_words=['book','books'])

Below is the distribution of Sentence length

In [19]:
merge_dataset['title_len'] = merge_dataset['Title'].apply(lambda x: len(x.split()))

train_condition = merge_dataset['LogPrice'] != -1 
fig = px.box(x=merge_dataset.loc[train_condition,'title_len'],\
                 y=merge_dataset.loc[train_condition,'Price'],\
             log_y=True)
fig.show()

Author

In [20]:
title_by_author = merge_dataset.groupby('Title').agg({'Author':'nunique'}).reset_index()
titles = title_by_author.loc[title_by_author['Author'] >1,:]
logger.debug(f'Titles with more than 1 author names \n{titles}')
DEBUG:root:Titles with more than 1 author names 
                                                                                                    Title  \
398   American Short Story Masterpieces: A Rich Selection of Recent Fiction from America's Best Modern...   
520                                                                               Artificial Intelligence   
731                                                                                     Between the Lines   
1033                                                                                          Che Guevara   
2400  Hyperbole and a Half: Unfortunate Situations, Flawed Coping Mechanisms, Mayhem, and Other Things...   
2570                                                                                    Infinity Gauntlet   
2588                                                                              International Relations   
2646                                                                                                   It   
2726                                                                                        Jurassic Park   
2912                                                                             Learn In 30 Days Through   
3451                                                                                             My Story   
3587                                                                                Nothing Lasts Forever   
3936                                                                                     Political Theory   
4024                                                                                     Programming in C   
5205                                                                                The Elements of Style   
5456                                                                                    The Invisible Man   
5704                                                                              The Old Man and the Sea   
5994                                                                                 The Story of My Life   

      Author  
398        2  
520        2  
731        2  
1033       2  
2400       2  
2570       2  
2588       2  
2646       2  
2726       2  
2912       3  
3451       2  
3587       2  
3936       2  
4024       2  
5205       3  
5456       2  
5704       2  
5994       2  
In [21]:
# Handling inconsistencies in Author name
merge_dataset.loc[merge_dataset['Author'] == 'Tom Jenks',\
                  'Author'] = 'Raymond Carver, Tom Jenks'
merge_dataset.loc[merge_dataset['Title'] == 'The Old Man and the Sea',\
                  'Author'] = 'Ernest Hemingway'
merge_dataset.loc[merge_dataset['Title'] == 'The Elements of Style',\
                  'Author'] = 'William Strunk Jr., E. B. White'
merge_dataset.loc[merge_dataset['Title'] == 'Hyperbole and a Half: Unfortunate Situations, Flawed Coping Mechanisms, Mayhem, and Other Things That Happened',\
                  'Author'] = 'Alexandra Brosh'
In [22]:
merge_dataset['Author_clean'] = merge_dataset['Author'].apply(lambda x: re.sub('\s+|\.|\,','',x.lower()))
In [23]:
#Top Authors 

pareto = .192

train_dataset = merge_dataset[train_condition].copy()

train_dataset,top_authors = eda.bucketize_pareto(train_dataset,\
                                                 'Author_clean',\
                                                 'LogPriceBucket',\
                                                 pareto)
logger.debug(f'Top authors are \n {top_authors}')
DEBUG:root:Top authors are 
 ['agathachristie', 'dk', 'ladybird', 'herge', 'albertuderzo', 'jamespatterson', 'billwatterson', 'johngrisham', 'pgwodehouse', 'sidneysheldon', 'clivecussler', 'noraroberts', 'georgerrmartin', 'sophiekinsella', 'davidbaldacci', 'stephenking', 'leechild', 'wilbursmith', 'oliverbowden', 'daniellesteel', 'akiratoriyama', 'dreamlandpublications', 'frederickforsyth', 'jeffreyarcher', 'various', 'geronimostilton', "louisl'amour", 'matthewreilly', 'danielsilva', 'neilgaiman', 'kenfollett', 'jamesrollins', 'ruskinbond', 'michaelcrichton', 'alistairmaclean', 'danbrown', 'jimdavis', 'davpilkey', 'billbryson', 'robertludlum', 'apjabdulkalam', 'lucacaioli', 'hbr', 'jkrowling', 'trinitycollegelond', 'eiichirooda', 'harukimurakami', 'ianfleming', 'ceceliaahern', 'anantpai', 'renégoscinnyalbertuderzo', 'ramachandraguha', 'conniggulden', 'archiesuperstars', 'koheihorikoshi', 'nationalgeographic', 'maplepress', 'novoneelchakraborty', 'oxforddictionaries', 'satyajitray', 'ernesthemingway', 'tsugumiohba', 'runningpress', 'masashikishimoto', 'hidenorikusaka', 'juliadonaldson', 'nikitasingh', 'one', 'carolvorderman', 'anthonyhorowitz', 'robertgalbraith', 'michaelconnelly']
72 ids identified in the Top 19.2% pareto
In [24]:
merge_dataset['top_Author'] = merge_dataset.\
apply(lambda x: x['Author_clean'] if x['Author_clean'] in top_authors else 'other_Author',axis=1)

Addding more details to Author such as

  • No. of Titles published
  • Min,Mean,Max,Std of LogPrice
In [25]:
train_condition = merge_dataset['LogPrice'] != -1 
author_details = (merge_dataset[train_condition].\
                  groupby('Author_clean').\
                  agg({'Title':'count',\
                       'LogPrice':[np.amin,\
                                   np.mean,\
                                   np.amax,\
                                   np.std]}).\
                  reset_index())

cols = ["Author" + "_" + str(i[0]) + "_" + str(i[1]) for i in author_details.columns]
author_details.columns = cols

#Dropping Books by Authors with less than 5 titles
author_details = author_details[author_details['Author_Title_count'] >= 2]
In [26]:
logger.debug(author_details)
DEBUG:root:            Author_Author_clean_  Author_Title_count  Author_LogPrice_amin  \
9                   aatishtaseer                   2                 2.572   
19    abhinavbindrarohitbrijnath                   2                 2.459   
20            abhinavchandrachud                   2                 2.544   
22                 abirmukherjee                   4                 1.991   
23                         abrsm                   2                 2.769   
...                          ...                 ...                   ...   
3607               yashodharalal                   3                 2.037   
3620               yoshitokioima                   2                 2.740   
3627             yuvalnoahharari                   3                 2.025   
3632            zagentertainment                   3                 2.820   
3638           zlatanibrahimovic                   2                 2.663   

      Author_LogPrice_mean  Author_LogPrice_amax  Author_LogPrice_std  
9                    2.572                 2.572                0.000  
19                   2.468                 2.477                0.013  
20                   2.544                 2.544                0.000  
22                   2.184                 2.511                0.248  
23                   2.772                 2.775                0.004  
...                    ...                   ...                  ...  
3607                 2.138                 2.273                0.121  
3620                 2.755                 2.769                0.021  
3627                 2.409                 2.668                0.339  
3632                 2.821                 2.823                0.002  
3638                 2.793                 2.923                0.184  

[890 rows x 6 columns]
In [27]:
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
                         author_details,\
                         left_on ='Author_clean',\
                         right_on = 'Author_Author_clean_',\
                         how='left')
logger.debug(merge_dataset.shape)
DEBUG:root:(7797, 15)
DEBUG:root:(7797, 21)
In [28]:
merge_dataset.drop('Author_Author_clean_',inplace=True,axis=1)
In [29]:
eda.explain_data(merge_dataset)
The data has 7797 rows and 20 columns
Below are the column wise data-types,missing values, unique level and descriptive stats of the data
Out[29]:
dtypes missing_values unique_values count unique top freq mean std min 25% 50% 75% max
index int64 0 6237 7797.000 NaN NaN NaN 2650.120 1873.325 0.000 974.000 2338.000 4287.000 6236.000
Title object 0 6787 7797 6787 Casino Royale: James Bond 007 (Vintage) 4 NaN NaN NaN NaN NaN NaN NaN
Author object 0 4367 7797 4367 Agatha Christie 87 NaN NaN NaN NaN NaN NaN NaN
Edition object 0 3882 7797 3882 Paperback,– 5 Oct 2017 60 NaN NaN NaN NaN NaN NaN NaN
Reviews object 0 36 7797 36 5.0 out of 5 stars 1751 NaN NaN NaN NaN NaN NaN NaN
Ratings object 0 368 7797 368 1 customer review 1328 NaN NaN NaN NaN NaN NaN NaN
Synopsis object 0 6771 7797 6771 A Tinkle Double Digest is two Tinkle Digests in one volume. These include the best stories of Ti... 8 NaN NaN NaN NaN NaN NaN NaN
Genre object 0 363 7797 363 Action & Adventure (Books) 1183 NaN NaN NaN NaN NaN NaN NaN
BookCategory object 0 11 7797 11 Action & Adventure 1036 NaN NaN NaN NaN NaN NaN NaN
Price float64 0 1615 7797.000 NaN NaN NaN 448.323 656.854 -1.000 134.000 309.000 504.000 14100.000
LogPrice float64 0 1615 7797.000 NaN NaN NaN 1.881 1.471 -1.000 2.127 2.490 2.702 4.149
LogPriceBucket object 0 6 7797 6 Target 1560 NaN NaN NaN NaN NaN NaN NaN
title_len int64 0 31 7797.000 NaN NaN NaN 7.141 4.378 1.000 4.000 6.000 9.000 32.000
Author_clean object 0 4326 7797 4326 agathachristie 87 NaN NaN NaN NaN NaN NaN NaN
top_Author object 0 73 7797 73 other_Author 6327 NaN NaN NaN NaN NaN NaN NaN
Author_Title_count float64 3725 29 4072.000 NaN NaN NaN 10.893 14.031 2.000 2.000 5.000 13.000 69.000
Author_LogPrice_amin float64 3725 504 4072.000 NaN NaN NaN 2.387 0.321 1.398 2.182 2.412 2.569 3.621
Author_LogPrice_mean float64 3725 870 4072.000 NaN NaN NaN 2.554 0.275 1.669 2.394 2.536 2.708 3.621
Author_LogPrice_amax float64 3725 549 4072.000 NaN NaN NaN 2.772 0.327 1.716 2.572 2.733 2.952 3.999
Author_LogPrice_std float64 3725 735 4072.000 NaN NaN NaN 0.133 0.112 0.000 0.050 0.121 0.187 0.832
In [30]:
condition = ((merge_dataset['Price'] != -1))
fig = px.box(merge_dataset[condition],\
             x='top_Author',\
             y='Price',
             log_y=True)
fig.show()
In [206]:
condition = ((merge_dataset['Price'] != -1))
fig = px.scatter(merge_dataset[condition],\
             x='Author_LogPrice_mean',\
             y='Price',
             color = 'Author_Title_count',
             log_y=True)
fig.show()
In [32]:
outlier_upper_bound = merge_dataset['Author_LogPrice_mean'] + 4*merge_dataset['Author_LogPrice_std']
outlier_lower_bound = merge_dataset['Author_LogPrice_mean'] - 4*merge_dataset['Author_LogPrice_std']
condition = ((merge_dataset['LogPrice'] > outlier_upper_bound) &\
             (merge_dataset['Author_LogPrice_std'] > 0))

logger.debug(f"Outliers are {merge_dataset[condition].shape}")
logger.debug(f"\n{merge_dataset[condition]}")
DEBUG:root:Outliers are (3, 20)
DEBUG:root:
      index                                                   Title  \
3101   3101                                     Tell Me Your Dreams   
3655   3655                                               Key Words   
3875   3875  While the Light Lasts (The Agatha Christie Collection)   

               Author                          Edition             Reviews  \
3101   Sidney Sheldon   Hardcover,– Import, 5 Oct 1998  4.4 out of 5 stars   
3655         Ladybird  Hardcover,– Box set, 2 Aug 2009  4.8 out of 5 stars   
3875  Agatha Christie    Hardcover,– Facsimile, Import  3.5 out of 5 stars   

                   Ratings  \
3101  175 customer reviews   
3655    7 customer reviews   
3875    7 customer reviews   

                                                                                                 Synopsis  \
3101  The fast-paced new novel from the internationally bestselling author of The Best Laid Plans, Mor...   
3655  This sturdy boxset contains the first six books from the Key Words with Peter and Jane reading s...   
3875  The very last Agatha Christie book, including some of her earliest stories – including her very ...   

                                        Genre  \
3101               Action & Adventure (Books)   
3655  Language, Linguistics & Writing (Books)   
3875                    Short Stories (Books)   

                         BookCategory    Price  LogPrice LogPriceBucket  \
3101               Action & Adventure 1290.660     3.111      Very High   
3655  Language, Linguistics & Writing  517.000     2.713           High   
3875        Crime, Thriller & Mystery  896.000     2.952      Very High   

      title_len    Author_clean      top_Author  Author_Title_count  \
3101          4   sidneysheldon   sidneysheldon              28.000   
3655          2        ladybird        ladybird              50.000   
3875          8  agathachristie  agathachristie              69.000   

      Author_LogPrice_amin  Author_LogPrice_mean  Author_LogPrice_amax  \
3101                 2.167                 2.394                 3.111   
3655                 1.708                 1.966                 2.713   
3875                 2.188                 2.357                 2.952   

      Author_LogPrice_std  
3101                0.163  
3655                0.184  
3875                0.130  

Edition

This format of the Edition where (Language),Bind,-Sourcing,Day,Month,Year

In [33]:
logger.debug(f"Top 5 records in Edition are \n {merge_dataset['Edition'].head()}")
DEBUG:root:Top 5 records in Edition are 
 0    Paperback,– 10 Mar 2016
1     Paperback,– 7 Nov 2012
2    Paperback,– 25 Feb 1982
3     Paperback,– 5 Oct 2017
4    Hardcover,– 10 Oct 2006
Name: Edition, dtype: object
In [34]:
pattern = '(?:(\(\w+\)),)?(\w+(?:-\w+)?(?:\s+\w+)?(?:\s+\w+)?),–\s+(?:(\w+(?:\s+\w+)?),\s+)?(?:(\d+)\s+)?(?:(\w+)\s+)?(?:(\d+))?'

edition = merge_dataset['Edition'].\
apply(lambda x: re.findall(pattern,x))
logger.debug(f'Glimpse of edition \n {edition}')
DEBUG:root:Glimpse of edition 
 0       [(, Paperback, , 10, Mar, 2016)]
1        [(, Paperback, , 7, Nov, 2012)]
2       [(, Paperback, , 25, Feb, 1982)]
3        [(, Paperback, , 5, Oct, 2017)]
4       [(, Hardcover, , 10, Oct, 2006)]
                      ...               
7792    [(, Paperback, , 14, Apr, 2011)]
7793     [(, Paperback, , 8, May, 2013)]
7794     [(, Paperback, , 6, Sep, 2011)]
7795    [(, Paperback, , 22, Sep, 2009)]
7796    [(, Paperback, , 16, Sep, 2016)]
Name: Edition, Length: 7797, dtype: object
In [35]:
lan = []
bind = []
sour = []
month = []
year = []

for i in edition:
  lan.append(i[0][0])
  bind.append(i[0][1])
  sour.append(i[0][2])
  month.append(i[0][4])
  year.append(i[0][5])

logger.debug(f'Unique value for lan are {np.unique(lan)}')
logger.debug(f'Unique value for bind are {np.unique(bind)}')
logger.debug(f'Unique value for sour are {np.unique(sour)}')
logger.debug(f'Unique value for month are {np.unique(month)}')
logger.debug(f'Unique value for year are {np.unique(year)}')

edition_details = pd.DataFrame(list(zip(bind,\
                                        sour,\
                                        month,\
                                        year)),\
                               columns = ['bind',\
                                          'sour',\
                                          'month',\
                                          'year'])
DEBUG:root:Unique value for lan are ['' '(Chinese)' '(French)' '(German)' '(Kannada)' '(Spanish)']
DEBUG:root:Unique value for bind are ['Board book' 'Cards' 'Flexibound' 'Hardcover' 'Leather Bound'
 'Library Binding' 'Loose Leaf' 'Mass Market Paperback' 'Paperback'
 'Perfect Paperback' 'Plastic Comb' 'Product Bundle' 'Sheet music'
 'Spiral-bound' 'Tankobon Softcover']
DEBUG:root:Unique value for sour are ['' 'ADPCM' 'Abridged' 'Audiobook' 'Bargain Price' 'Box set' 'DVD'
 'Deckle Edge' 'Deluxe Edition' 'EveryBook' 'Facsimile' 'Illustrated'
 'Import' 'International Edition' 'Kindle eBook' 'Large Print' 'Print'
 'Special Edition' 'Student Edition' 'Unabridged']
DEBUG:root:Unique value for month are ['' 'Apr' 'Aug' 'Box' 'Dec' 'Feb' 'Jan' 'Jul' 'Jun' 'Large' 'Mar' 'May'
 'Nov' 'Oct' 'Sep' 'Special']
DEBUG:root:Unique value for year are ['' '1900' '1905' '1925' '1942' '1960' '1961' '1964' '1969' '1970' '1971'
 '1972' '1973' '1974' '1975' '1976' '1977' '1978' '1979' '1980' '1981'
 '1982' '1983' '1984' '1985' '1986' '1987' '1988' '1989' '1990' '1991'
 '1992' '1993' '1994' '1995' '1996' '1997' '1998' '1999' '2000' '2001'
 '2002' '2003' '2004' '2005' '2006' '2007' '2008' '2009' '2010' '2011'
 '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019']
In [36]:
# Correcting Regex Error
edition_details.loc[edition_details['sour'].\
                isin(['']),'sour'] = 'Other_sour'
edition_details.loc[edition_details['month'].\
                isin(['Box','Large','Special','']),'month'] = 'Unknown'
edition_details.loc[edition_details['year'].\
                isin(['']),'year'] = '2018'
In [37]:
edition_details['month'].unique()
Out[37]:
array(['Mar', 'Nov', 'Feb', 'Oct', 'May', 'Dec', 'Jan', 'Jun', 'Jul',
       'Sep', 'Unknown', 'Aug', 'Apr'], dtype=object)
In [38]:
month_label = dict({'Mar': 3,\
                    'Nov' : 11,\
                    'Feb' : 2,\
                    'Oct' : 10,\
                    'May' : 5,\
                    'Dec' : 12,\
                    'Jan' : 1,\
                    'Jun' : 6,\
                    'Jul' : 7,\
                    'Sep' : 9,\
                    'Unknown': 0,\
                    'Aug' : 8,
                    'Apr': 4})

edition_details['month'].replace(month_label,inplace=True)
In [39]:
merge_dataset = pd.concat([merge_dataset,\
                           edition_details],axis=1)
In [40]:
eda.explain_data(merge_dataset)
The data has 7797 rows and 24 columns
Below are the column wise data-types,missing values, unique level and descriptive stats of the data
Out[40]:
dtypes missing_values unique_values count unique top freq mean std min 25% 50% 75% max
index int64 0 6237 7797.000 NaN NaN NaN 2650.120 1873.325 0.000 974.000 2338.000 4287.000 6236.000
Title object 0 6787 7797 6787 Casino Royale: James Bond 007 (Vintage) 4 NaN NaN NaN NaN NaN NaN NaN
Author object 0 4367 7797 4367 Agatha Christie 87 NaN NaN NaN NaN NaN NaN NaN
Edition object 0 3882 7797 3882 Paperback,– 5 Oct 2017 60 NaN NaN NaN NaN NaN NaN NaN
Reviews object 0 36 7797 36 5.0 out of 5 stars 1751 NaN NaN NaN NaN NaN NaN NaN
Ratings object 0 368 7797 368 1 customer review 1328 NaN NaN NaN NaN NaN NaN NaN
Synopsis object 0 6771 7797 6771 A Tinkle Double Digest is two Tinkle Digests in one volume. These include the best stories of Ti... 8 NaN NaN NaN NaN NaN NaN NaN
Genre object 0 363 7797 363 Action & Adventure (Books) 1183 NaN NaN NaN NaN NaN NaN NaN
BookCategory object 0 11 7797 11 Action & Adventure 1036 NaN NaN NaN NaN NaN NaN NaN
Price float64 0 1615 7797.000 NaN NaN NaN 448.323 656.854 -1.000 134.000 309.000 504.000 14100.000
LogPrice float64 0 1615 7797.000 NaN NaN NaN 1.881 1.471 -1.000 2.127 2.490 2.702 4.149
LogPriceBucket object 0 6 7797 6 Target 1560 NaN NaN NaN NaN NaN NaN NaN
title_len int64 0 31 7797.000 NaN NaN NaN 7.141 4.378 1.000 4.000 6.000 9.000 32.000
Author_clean object 0 4326 7797 4326 agathachristie 87 NaN NaN NaN NaN NaN NaN NaN
top_Author object 0 73 7797 73 other_Author 6327 NaN NaN NaN NaN NaN NaN NaN
Author_Title_count float64 3725 29 4072.000 NaN NaN NaN 10.893 14.031 2.000 2.000 5.000 13.000 69.000
Author_LogPrice_amin float64 3725 504 4072.000 NaN NaN NaN 2.387 0.321 1.398 2.182 2.412 2.569 3.621
Author_LogPrice_mean float64 3725 870 4072.000 NaN NaN NaN 2.554 0.275 1.669 2.394 2.536 2.708 3.621
Author_LogPrice_amax float64 3725 549 4072.000 NaN NaN NaN 2.772 0.327 1.716 2.572 2.733 2.952 3.999
Author_LogPrice_std float64 3725 735 4072.000 NaN NaN NaN 0.133 0.112 0.000 0.050 0.121 0.187 0.832
bind object 0 15 7797 15 Paperback 6463 NaN NaN NaN NaN NaN NaN NaN
sour object 0 20 7797 20 Other_sour 6808 NaN NaN NaN NaN NaN NaN NaN
month int64 0 13 7797.000 NaN NaN NaN 6.195 3.651 0.000 3.000 6.000 9.000 12.000
year object 0 58 7797 58 2018 1048 NaN NaN NaN NaN NaN NaN NaN

Definite pricing trend observed with respect to bind type, Spiral bound being expensive as compared to Hardcover and Paperback. Mass Market Paperback being the cheapest alternative for all 4.

In [41]:
fig = px.box(merge_dataset[merge_dataset['Price'] != -1],\
             x='bind',\
             y='Price',\
             log_y=True)
fig.show()
In [42]:
#Top Binds 

pareto = .995

train_dataset = merge_dataset[train_condition].copy()

train_dataset,top_binds = eda.bucketize_pareto(train_dataset,\
                                                 'bind',\
                                                 'LogPriceBucket',\
                                                 pareto)
4 ids identified in the Top 99.5% pareto
In [43]:
merge_dataset['top_bind'] = merge_dataset.\
apply(lambda x: x['bind'] if x['bind'] in top_binds else 'other_bind',axis=1)
In [44]:
train_condition = merge_dataset['LogPrice'] != -1 
bind_details = (merge_dataset[train_condition].\
                  groupby('bind').\
                  agg({'Title':'count',\
                       'LogPrice':[np.amin,\
                                   np.mean,\
                                   np.amax,\
                                   np.std]}).\
                  reset_index())

cols = ["bind" + "_" + str(i[0]) + "_" + str(i[1]) for i in bind_details.columns]
bind_details.columns = cols
#Dropping Books by bind with less than 5 titles
bind_details = bind_details[bind_details['bind_Title_count'] >= 2]
In [45]:
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
                         bind_details,\
                         left_on='bind',\
                         right_on = 'bind_bind_',\
                         how='left'
                         )
logger.debug(merge_dataset.shape)
DEBUG:root:(7797, 25)
DEBUG:root:(7797, 31)
In [46]:
merge_dataset.drop('bind_bind_',inplace=True,axis=1)
In [47]:
#Top Sour 

pareto = .985

train_dataset = merge_dataset[train_condition].copy()

train_dataset,top_binds = eda.bucketize_pareto(train_dataset,\
                                                 'sour',\
                                                 'LogPriceBucket',\
                                                 pareto)
4 ids identified in the Top 98.5% pareto
In [48]:
merge_dataset['top_sour'] = merge_dataset.\
apply(lambda x: x['sour'] if x['sour'] in top_binds else 'other_sour',axis=1)
In [49]:
train_condition = merge_dataset['LogPrice'] != -1 
sour_details = (merge_dataset[train_condition].\
                  groupby('sour').\
                  agg({'Title':'count',\
                       'LogPrice':[np.amin,\
                                   np.mean,\
                                   np.amax,\
                                   np.std]}).\
                  reset_index())

cols = ["sour" + "_" + str(i[0]) + "_" + str(i[1]) for i in sour_details.columns]
sour_details.columns = cols
#Dropping Books by bind with less than 5 titles
sour_details = sour_details[sour_details['sour_Title_count'] >= 2]
In [50]:
logger.debug(sour_details)
DEBUG:root:               sour_sour_  sour_Title_count  sour_LogPrice_amin  \
1                Abridged                15               2.255   
2               Audiobook                 4               2.228   
4                 Box set                11               2.712   
6             Deckle Edge                 7               1.996   
9               Facsimile                 3               2.476   
10            Illustrated                53               2.090   
11                 Import               616               1.602   
12  International Edition                10               2.274   
14            Large Print                 8               2.097   
15             Other_sour              5451               1.398   
16                  Print                 3               1.996   
17        Special Edition                18               2.068   
18        Student Edition                14               2.199   
19             Unabridged                18               1.833   

    sour_LogPrice_mean  sour_LogPrice_amax  sour_LogPrice_std  
1                2.675               3.352              0.338  
2                2.954               3.428              0.531  
4                3.159               3.601              0.339  
6                2.542               3.150              0.440  
9                2.922               3.339              0.433  
10               2.729               3.314              0.317  
11               2.665               4.122              0.365  
12               2.499               2.640              0.125  
14               2.494               2.951              0.298  
15               2.591               4.149              0.326  
16               2.466               3.045              0.533  
17               2.685               3.061              0.270  
18               2.651               3.704              0.423  
19               2.654               3.463              0.437  
In [51]:
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
                         sour_details,\
                         left_on='sour',\
                         right_on = 'sour_sour_',\
                         how='left'
                         )
logger.debug(merge_dataset.shape)
DEBUG:root:(7797, 31)
DEBUG:root:(7797, 37)
In [52]:
merge_dataset.drop('sour_sour_',inplace=True,axis=1)

The price range increases and gets more wide as year of publication increases.

In [53]:
condition = ((merge_dataset['LogPrice'] != -1) &\
             (merge_dataset['month'].isin(['Jan','Feb'])))
In [207]:
year_month_price = (merge_dataset[condition].\
                    groupby(['year',\
                             'month'])['Price'].\
                    mean().\
                    reset_index())

fig = px.scatter(year_month_price,\
             x='year',\
             y='Price',\
             color='month',\
             log_y=True,
              range_x=(1980,2021))
fig.show()

Reviews

In [55]:
pattern = '(\d+.\d+)\s+out of 5 stars'

reviews = merge_dataset['Reviews'].\
apply(lambda x: re.findall(pattern,x))

review_num = []

for i in reviews:
  review_num.append(i[0])

merge_dataset['review_num'] = review_num
merge_dataset['review_num'] = merge_dataset['review_num'].astype('float64')

Reviews is not neccesarily dependent on the Price and is almost evenly spread around reviews

In [56]:
fig = px.box(merge_dataset[merge_dataset['Price'] != -1],\
             x='review_num',\
             y='Price',\
             log_y=True)
fig.show()

Ratings

In [57]:
merge_dataset['rating_num'] = (merge_dataset['Ratings'].\
                               apply(lambda x: x.split()[0]).
                               apply(lambda x: x.replace(',','')))
merge_dataset['rating_num'] = merge_dataset['rating_num'].astype('float64')

Popular books with High Rating have specific price range, hinting towards the most selling price range

In [208]:
fig = px.scatter(merge_dataset[merge_dataset['Price'] != -1],\
             x='rating_num',\
             y='Price',\
             log_y=True,
             log_x=True)
fig.show()
In [59]:
merge_dataset['rating_num'] = merge_dataset['rating_num'].\
apply(lambda x: np.log10(x+10))

Genre

In [60]:
#Top Genres 

pareto = .48

train_dataset = merge_dataset[train_condition].copy()

train_dataset,top_genres = eda.bucketize_pareto(train_dataset,\
                                                 'Genre',\
                                                 'LogPriceBucket',\
                                                 pareto)
10 ids identified in the Top 48.0% pareto
In [61]:
merge_dataset['top_Genre'] = merge_dataset.\
apply(lambda x: x['Genre'] if x['Genre'] in top_genres else 'other_Genre',axis=1)
In [62]:
train_condition = merge_dataset['LogPrice'] != -1 
Genre_details = (merge_dataset[train_condition].\
                  groupby('Genre').\
                  agg({'Title':'count',\
                       'LogPrice':[np.amin,\
                                   np.mean,\
                                   np.amax,\
                                   np.std]}).\
                  reset_index())

cols = ["Genre" + "_" + str(i[0]) + "_" + str(i[1]) for i in Genre_details.columns]
Genre_details.columns = cols

#Dropping Books by Authors with less than 5 titles
Genre_details = Genre_details[Genre_details['Genre_Title_count'] >= 2]
In [63]:
logger.debug(Genre_details)
DEBUG:root:                                  Genre_Genre_  Genre_Title_count  \
0                 API & Operating Environments                  2   
1                   Action & Adventure (Books)                947   
5                Agriculture & Farming (Books)                  6   
7                       Algebra & Trigonometry                  2   
8                                   Algorithms                 16   
..                                         ...                ...   
333                                Visual Arts                  5   
335                           Vocabulary Books                  7   
340  World African & Middle Eastern Literature                  2   
341                     Writing Guides (Books)                 71   
342                 XHTML Software Programming                  2   

     Genre_LogPrice_amin  Genre_LogPrice_mean  Genre_LogPrice_amax  \
0                  3.000                3.201                3.403   
1                  1.699                2.521                3.748   
5                  2.519                2.675                3.150   
7                  2.931                3.160                3.389   
8                  2.149                2.854                3.686   
..                   ...                  ...                  ...   
333                1.398                2.370                3.274   
335                1.839                2.393                2.907   
340                2.899                2.899                2.899   
341                1.756                2.727                3.548   
342                2.777                3.167                3.557   

     Genre_LogPrice_std  
0                 0.285  
1                 0.242  
5                 0.237  
7                 0.324  
8                 0.427  
..                  ...  
333               0.869  
335               0.372  
340               0.000  
341               0.293  
342               0.551  

[256 rows x 6 columns]
In [64]:
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
                         Genre_details,\
                         left_on='Genre',\
                         right_on = 'Genre_Genre_',\
                         how='left'
                         )
logger.debug(merge_dataset.shape)
DEBUG:root:(7797, 39)
DEBUG:root:(7797, 45)
In [65]:
merge_dataset.drop('Genre_Genre_',inplace=True,axis=1)
In [209]:
condition = ((merge_dataset['Price'] != -1) &\
             (merge_dataset['top_Genre'] != 'other_Genre'))
fig = px.scatter(merge_dataset[condition],\
             x='Price',\
             color='top_Genre',
             log_x=True)
fig.show()
In [210]:
condition = ((merge_dataset['Price'] != -1))
fig = px.scatter(merge_dataset[condition],\
             x='Genre_LogPrice_mean',\
             y='Price',
             color = 'Genre_Title_count',
             log_y=True)
fig.show()

BookCategory

In [211]:
condition = (merge_dataset['Price'] != -1)
fig = px.scatter(merge_dataset[condition],\
             color='BookCategory',\
             x='Price',
             log_x=True)
fig.show()
In [69]:
train_condition = merge_dataset['LogPrice'] != -1 
BookCat_details = (merge_dataset[train_condition].\
                  groupby('BookCategory').\
                  agg({'Title':'count',\
                       'LogPrice':[np.amin,\
                                   np.mean,\
                                   np.amax,\
                                   np.std]}).\
                  reset_index())

cols = ["BookCategory" + "_" + str(i[0]) + "_" + str(i[1]) for i in BookCat_details.columns]
BookCat_details.columns = cols
In [70]:
logger.debug(BookCat_details)
DEBUG:root:              BookCategory_BookCategory_  BookCategory_Title_count  \
0                     Action & Adventure                       818   
1               Arts, Film & Photography                       517   
2   Biographies, Diaries & True Accounts                       596   
3                        Comics & Mangas                       583   
4    Computing, Internet & Digital Media                       510   
5              Crime, Thriller & Mystery                       723   
6                                 Humour                       540   
7        Language, Linguistics & Writing                       594   
8                               Politics                       325   
9                                Romance                       560   
10                                Sports                       471   

    BookCategory_LogPrice_amin  BookCategory_LogPrice_mean  \
0                        1.699                       2.530   
1                        1.398                       2.773   
2                        1.398                       2.478   
3                        1.748                       2.714   
4                        1.898                       2.833   
5                        1.833                       2.456   
6                        1.833                       2.697   
7                        1.477                       2.487   
8                        1.653                       2.613   
9                        1.568                       2.437   
10                       1.977                       2.750   

    BookCategory_LogPrice_amax  BookCategory_LogPrice_std  
0                        3.748                      0.250  
1                        4.069                      0.358  
2                        3.438                      0.240  
3                        4.149                      0.349  
4                        3.764                      0.354  
5                        3.438                      0.226  
6                        3.699                      0.309  
7                        3.714                      0.386  
8                        4.122                      0.261  
9                        3.999                      0.281  
10                       3.863                      0.307  
In [71]:
logger.debug(merge_dataset.shape)
merge_dataset = pd.merge(merge_dataset,\
                         BookCat_details,\
                         left_on='BookCategory',\
                         right_on = 'BookCategory_BookCategory_',\
                         how='left'
                         )
logger.debug(merge_dataset.shape)
DEBUG:root:(7797, 44)
DEBUG:root:(7797, 50)
In [72]:
merge_dataset.drop('BookCategory_BookCategory_',inplace=True,axis=1)

Synopsis

In [73]:
wc = eda.create_word_cloud(merge_dataset['Synopsis'])
In [74]:
merge_dataset['synopsis_len'] = merge_dataset['Synopsis'].apply(lambda x: len(x.split()))

fig = px.box(y=merge_dataset['synopsis_len'])
fig.show()

Data pre-processing

Missing values
In [75]:
logger.debug(f"There are {merge_dataset.isnull().sum().sum()} missing values")
logger.debug(f"Columns with missing values are \n{merge_dataset.\
                                                isnull().\
                                                sum()[merge_dataset.\
                                                      isnull().\
                                                      sum()>0]}")
DEBUG:root:There are 19360 missing values
DEBUG:root:Columns with missing values are 
Author_Title_count      3725
Author_LogPrice_amin    3725
Author_LogPrice_mean    3725
Author_LogPrice_amax    3725
Author_LogPrice_std     3725
bind_Title_count           6
bind_LogPrice_amin         6
bind_LogPrice_mean         6
bind_LogPrice_amax         6
bind_LogPrice_std          6
sour_Title_count           7
sour_LogPrice_amin         7
sour_LogPrice_mean         7
sour_LogPrice_amax         7
sour_LogPrice_std          7
Genre_Title_count        134
Genre_LogPrice_amin      134
Genre_LogPrice_mean      134
Genre_LogPrice_amax      134
Genre_LogPrice_std       134
dtype: int64

NA in above values suggest absence of respective fields in training data.

  • Count columns can be replaced by 0 since these could be the first Book by an Author or in the Genre
  • Std is null because there is either 1 or no entry about the book hence can be filled by 0
  • Replacing min,mean,max for Author and Genre can be done by taking Average of other categories such as Bind,Sour,BookCategory to comeup with the value to impute missing values.
In [76]:
replace_zero = ['Author_Title_count',\
                'Author_LogPrice_std',\
                'bind_Title_count',\
                'bind_LogPrice_std',\
                'sour_Title_count',\
                'sour_LogPrice_std',\
                'Genre_Title_count',\
                'Genre_LogPrice_std']
merge_dataset.loc[:,replace_zero] = merge_dataset.loc[:,replace_zero].fillna(0)
In [77]:
def impute_values(df,impute_vals,impute_cols,func):
  if func == 'min':
    df['impute_vals'] = df.loc[:,impute_vals].min(axis=1)
  if func == 'mean':
    df['impute_vals'] = df.loc[:,impute_vals].mean(axis=1)
  if func == 'max':
    df['impute_vals'] = df.loc[:,impute_vals].max(axis=1)
  
  for col in impute_cols:
    df[col] = df.apply(lambda x: x['impute_vals'] if np.isnan(x[col]) else x[col],\
                         axis=1)
  df.drop('impute_vals',inplace=True,axis=1)
  return df
In [78]:
impute_vals = ['BookCategory_LogPrice_amin']
impute_cols = ['Author_LogPrice_amin',\
               'Genre_LogPrice_amin',\
               'sour_LogPrice_amin',\
               'bind_LogPrice_amin']
merge_dataset = impute_values(merge_dataset,impute_vals,impute_cols,'min')
impute_vals = ['BookCategory_LogPrice_mean']
impute_cols = ['Author_LogPrice_mean',\
               'Genre_LogPrice_mean',\
               'sour_LogPrice_mean',\
               'bind_LogPrice_mean']
merge_dataset = impute_values(merge_dataset,impute_vals,impute_cols,'mean')
impute_vals = ['BookCategory_LogPrice_amax']
impute_cols = ['Author_LogPrice_amax',\
               'Genre_LogPrice_amax',\
               'sour_LogPrice_amax',\
               'bind_LogPrice_amax']
merge_dataset = impute_values(merge_dataset,impute_vals,impute_cols,'max')
In [79]:
eda.explain_data(merge_dataset)
The data has 7797 rows and 50 columns
Below are the column wise data-types,missing values, unique level and descriptive stats of the data
Out[79]:
dtypes missing_values unique_values count unique top freq mean std min 25% 50% 75% max
index int64 0 6237 7797.000 NaN NaN NaN 2650.120 1873.325 0.000 974.000 2338.000 4287.000 6236.000
Title object 0 6787 7797 6787 Casino Royale: James Bond 007 (Vintage) 4 NaN NaN NaN NaN NaN NaN NaN
Author object 0 4367 7797 4367 Agatha Christie 87 NaN NaN NaN NaN NaN NaN NaN
Edition object 0 3882 7797 3882 Paperback,– 5 Oct 2017 60 NaN NaN NaN NaN NaN NaN NaN
Reviews object 0 36 7797 36 5.0 out of 5 stars 1751 NaN NaN NaN NaN NaN NaN NaN
Ratings object 0 368 7797 368 1 customer review 1328 NaN NaN NaN NaN NaN NaN NaN
Synopsis object 0 6771 7797 6771 A Tinkle Double Digest is two Tinkle Digests in one volume. These include the best stories of Ti... 8 NaN NaN NaN NaN NaN NaN NaN
Genre object 0 363 7797 363 Action & Adventure (Books) 1183 NaN NaN NaN NaN NaN NaN NaN
BookCategory object 0 11 7797 11 Action & Adventure 1036 NaN NaN NaN NaN NaN NaN NaN
Price float64 0 1615 7797.000 NaN NaN NaN 448.323 656.854 -1.000 134.000 309.000 504.000 14100.000
LogPrice float64 0 1615 7797.000 NaN NaN NaN 1.881 1.471 -1.000 2.127 2.490 2.702 4.149
LogPriceBucket object 0 6 7797 6 Target 1560 NaN NaN NaN NaN NaN NaN NaN
title_len int64 0 31 7797.000 NaN NaN NaN 7.141 4.378 1.000 4.000 6.000 9.000 32.000
Author_clean object 0 4326 7797 4326 agathachristie 87 NaN NaN NaN NaN NaN NaN NaN
top_Author object 0 73 7797 73 other_Author 6327 NaN NaN NaN NaN NaN NaN NaN
Author_Title_count float64 0 30 7797.000 NaN NaN NaN 5.689 11.507 0.000 0.000 2.000 5.000 69.000
Author_LogPrice_amin float64 0 509 7797.000 NaN NaN NaN 2.051 0.444 1.398 1.699 1.977 2.418 3.621
Author_LogPrice_mean float64 0 881 7797.000 NaN NaN NaN 2.589 0.225 1.669 2.456 2.579 2.750 3.621
Author_LogPrice_amax float64 0 555 7797.000 NaN NaN NaN 3.262 0.586 1.716 2.713 3.408 3.764 4.149
Author_LogPrice_std float64 0 735 7797.000 NaN NaN NaN 0.070 0.105 0.000 0.000 0.000 0.127 0.832
bind object 0 15 7797 15 Paperback 6463 NaN NaN NaN NaN NaN NaN NaN
sour object 0 20 7797 20 Other_sour 6808 NaN NaN NaN NaN NaN NaN NaN
month int64 0 13 7797.000 NaN NaN NaN 6.195 3.651 0.000 3.000 6.000 9.000 12.000
year object 0 58 7797 58 2018 1048 NaN NaN NaN NaN NaN NaN NaN
top_bind object 0 5 7797 5 Paperback 6463 NaN NaN NaN NaN NaN NaN NaN
bind_Title_count float64 0 10 7797.000 NaN NaN NaN 4423.427 1707.001 0.000 5197.000 5197.000 5197.000 5197.000
bind_LogPrice_amin float64 0 11 7797.000 NaN NaN NaN 1.454 0.148 1.398 1.398 1.398 1.398 2.674
bind_LogPrice_mean float64 0 13 7797.000 NaN NaN NaN 2.602 0.088 2.478 2.569 2.569 2.569 2.932
bind_LogPrice_amax float64 0 13 7797.000 NaN NaN NaN 3.989 0.149 2.607 3.999 3.999 3.999 4.149
bind_LogPrice_std float64 0 10 7797.000 NaN NaN NaN 0.319 0.033 0.000 0.312 0.312 0.312 0.391
top_sour object 0 5 7797 5 Other_sour 6808 NaN NaN NaN NaN NaN NaN NaN
sour_Title_count float64 0 13 7797.000 NaN NaN NaN 4821.350 1654.396 0.000 5451.000 5451.000 5451.000 5451.000
sour_LogPrice_amin float64 0 15 7797.000 NaN NaN NaN 1.438 0.136 1.398 1.398 1.398 1.398 2.712
sour_LogPrice_mean float64 0 19 7797.000 NaN NaN NaN 2.602 0.038 2.456 2.591 2.591 2.591 3.159
sour_LogPrice_amax float64 0 18 7797.000 NaN NaN NaN 4.123 0.145 2.640 4.149 4.149 4.149 4.149
sour_LogPrice_std float64 0 15 7797.000 NaN NaN NaN 0.330 0.021 0.000 0.326 0.326 0.326 0.533
review_num float64 0 36 7797.000 NaN NaN NaN 4.296 0.664 1.000 4.000 4.400 4.800 5.000
rating_num float64 0 368 7797.000 NaN NaN NaN 1.355 0.376 1.041 1.079 1.204 1.491 3.785
top_Genre object 0 11 7797 11 other_Genre 4112 NaN NaN NaN NaN NaN NaN NaN
Genre_Title_count float64 0 60 7797.000 NaN NaN NaN 251.446 320.225 0.000 30.000 78.000 373.000 947.000
Genre_LogPrice_amin float64 0 185 7797.000 NaN NaN NaN 1.935 0.311 1.398 1.699 1.845 2.176 3.228
Genre_LogPrice_mean float64 0 267 7797.000 NaN NaN NaN 2.601 0.179 1.825 2.481 2.569 2.733 3.443
Genre_LogPrice_amax float64 0 244 7797.000 NaN NaN NaN 3.500 0.354 2.009 3.265 3.562 3.748 4.149
Genre_LogPrice_std float64 0 253 7797.000 NaN NaN NaN 0.272 0.082 0.000 0.236 0.269 0.311 1.002
BookCategory_Title_count int64 0 11 7797.000 NaN NaN NaN 592.889 122.073 325.000 517.000 583.000 596.000 818.000
BookCategory_LogPrice_amin float64 0 9 7797.000 NaN NaN NaN 1.681 0.187 1.398 1.477 1.699 1.833 1.977
BookCategory_LogPrice_mean float64 0 11 7797.000 NaN NaN NaN 2.603 0.137 2.437 2.478 2.530 2.714 2.833
BookCategory_LogPrice_amax float64 0 10 7797.000 NaN NaN NaN 3.794 0.236 3.438 3.699 3.748 3.999 4.149
BookCategory_LogPrice_std float64 0 11 7797.000 NaN NaN NaN 0.299 0.053 0.226 0.250 0.307 0.354 0.386
synopsis_len int64 0 486 7797.000 NaN NaN NaN 164.243 93.956 3.000 104.000 151.000 208.000 2164.000

Datatype conversion

In [80]:
logger.debug(merge_dataset.columns)
DEBUG:root:Index(['index', 'Title', 'Author', 'Edition', 'Reviews', 'Ratings', 'Synopsis',
       'Genre', 'BookCategory', 'Price', 'LogPrice', 'LogPriceBucket',
       'title_len', 'Author_clean', 'top_Author', 'Author_Title_count',
       'Author_LogPrice_amin', 'Author_LogPrice_mean', 'Author_LogPrice_amax',
       'Author_LogPrice_std', 'bind', 'sour', 'month', 'year', 'top_bind',
       'bind_Title_count', 'bind_LogPrice_amin', 'bind_LogPrice_mean',
       'bind_LogPrice_amax', 'bind_LogPrice_std', 'top_sour',
       'sour_Title_count', 'sour_LogPrice_amin', 'sour_LogPrice_mean',
       'sour_LogPrice_amax', 'sour_LogPrice_std', 'review_num', 'rating_num',
       'top_Genre', 'Genre_Title_count', 'Genre_LogPrice_amin',
       'Genre_LogPrice_mean', 'Genre_LogPrice_amax', 'Genre_LogPrice_std',
       'BookCategory_Title_count', 'BookCategory_LogPrice_amin',
       'BookCategory_LogPrice_mean', 'BookCategory_LogPrice_amax',
       'BookCategory_LogPrice_std', 'synopsis_len'],
      dtype='object')
In [81]:
target = 'LogPrice'
cat_cols = ['BookCategory',\
            'top_Author',\
            'top_bind',\
            'top_sour',\
            'top_Genre']
num_cols = ['title_len',\
            'Author_Title_count',\
            'Author_LogPrice_amin',\
            'Author_LogPrice_mean',\
            'Author_LogPrice_amax',\
            'Author_LogPrice_std',\
            'month',\
            'year',\
            'bind_Title_count',\
            'bind_LogPrice_amin',\
            'bind_LogPrice_mean',\
            'bind_LogPrice_amax',\
            'bind_LogPrice_std',\
            'sour_Title_count',\
            'sour_LogPrice_amin',\
            'sour_LogPrice_mean',\
            'sour_LogPrice_amax',\
            'sour_LogPrice_std',\
            'review_num',\
            'rating_num',\
            'Genre_Title_count',\
            'Genre_LogPrice_amin',\
            'Genre_LogPrice_mean',\
            'Genre_LogPrice_amax',\
            'Genre_LogPrice_std',\
            'BookCategory_Title_count',\
            'BookCategory_LogPrice_amin',\
            'BookCategory_LogPrice_mean',\
            'BookCategory_LogPrice_amax',\
            'BookCategory_LogPrice_std',\
            'synopsis_len']

text_cols = ['Title',\
             'Synopsis']

index_cols = ['index']

merge_dataset.loc[:,cat_cols] = merge_dataset[cat_cols].astype('category')
merge_dataset.loc[:,num_cols] = merge_dataset[num_cols].astype('float64')

Feature engineering

Adding sequence of Year within each book.As overtime, there will be some intelligence in Price build after the first book for the author is launched.

In [82]:
merge_dataset['year_month'] = merge_dataset['year']*100 + merge_dataset['month']
merge_dataset['title_seq'] = (merge_dataset.groupby(['Author'])['year_month'].rank())
author_title_max = (merge_dataset.\
                      groupby('Author')['title_seq'].\
                      max().\
                      reset_index())
author_title_max.columns = ['Author','title_seq_max']
merge_dataset = pd.merge(merge_dataset,\
                         author_title_max,\
                         on='Author')
merge_dataset['title_seq'] = merge_dataset['title_seq']/merge_dataset['title_seq_max']
merge_dataset.loc[merge_dataset['title_seq_max'] == 1,'title_seq'] = 0
logger.debug(f"\n {merge_dataset[merge_dataset['Author'] == 'Chris Kuzneski']}")
DEBUG:root:
    index                                            Title          Author  \
0      0              The Prisoner's Gold (The Hunters 3)  Chris Kuzneski   
1    390                               The Forbidden Tomb  Chris Kuzneski   
2   4592   The Death Relic (Jonathon Payne & David Jones)  Chris Kuzneski   
3   4636  The Secret Crown (Jonathon Payne & David Jones)  Chris Kuzneski   

                   Edition             Reviews             Ratings  \
0  Paperback,– 10 Mar 2016  4.0 out of 5 stars  8 customer reviews   
1  Paperback,– 28 May 2014  4.1 out of 5 stars  7 customer reviews   
2  Paperback,– 29 Sep 2011  3.2 out of 5 stars  3 customer reviews   
3  Paperback,– 30 Sep 2010  4.6 out of 5 stars  3 customer reviews   

                                                                                              Synopsis  \
0  THE HUNTERS return in their third brilliant novel from the Sunday Times Top Ten bestselling auth...   
1  The Hunters if you seek the y will find. The treasure, for over two thousand years the legendary...   
2  The New World, 1545...\nVanquished by the Spanish Empire, little remains of the Aztec and Mayan ...   
3  Bavaria, 1886\nKing Ludwig II, infamous for his eccentric behavior, is declared insane by his go...   

                               Genre               BookCategory   Price  \
0         Action & Adventure (Books)         Action & Adventure 220.000   
1  Crime, Thriller & Mystery (Books)  Crime, Thriller & Mystery 329.000   
2         Action & Adventure (Books)         Action & Adventure 110.000   
3         Action & Adventure (Books)         Action & Adventure 249.000   

   LogPrice LogPriceBucket  title_len   Author_clean    top_Author  \
0     2.342       Very Low      6.000  chriskuzneski  other_Author   
1     2.517            Med      3.000  chriskuzneski  other_Author   
2     2.041       Very Low      8.000  chriskuzneski  other_Author   
3     2.396            Low      8.000  chriskuzneski  other_Author   

   Author_Title_count  Author_LogPrice_amin  Author_LogPrice_mean  \
0               4.000                 2.041                 2.324   
1               4.000                 2.041                 2.324   
2               4.000                 2.041                 2.324   
3               4.000                 2.041                 2.324   

   Author_LogPrice_amax  Author_LogPrice_std       bind        sour  month  \
0                 2.517                0.202  Paperback  Other_sour  3.000   
1                 2.517                0.202  Paperback  Other_sour  5.000   
2                 2.517                0.202  Paperback  Other_sour  9.000   
3                 2.517                0.202  Paperback  Other_sour  9.000   

      year   top_bind  bind_Title_count  bind_LogPrice_amin  \
0 2016.000  Paperback          5197.000               1.398   
1 2014.000  Paperback          5197.000               1.398   
2 2011.000  Paperback          5197.000               1.398   
3 2010.000  Paperback          5197.000               1.398   

   bind_LogPrice_mean  bind_LogPrice_amax  bind_LogPrice_std    top_sour  \
0               2.569               3.999              0.312  Other_sour   
1               2.569               3.999              0.312  Other_sour   
2               2.569               3.999              0.312  Other_sour   
3               2.569               3.999              0.312  Other_sour   

   sour_Title_count  sour_LogPrice_amin  sour_LogPrice_mean  \
0          5451.000               1.398               2.591   
1          5451.000               1.398               2.591   
2          5451.000               1.398               2.591   
3          5451.000               1.398               2.591   

   sour_LogPrice_amax  sour_LogPrice_std  review_num  rating_num  \
0               4.149              0.326       4.000       1.255   
1               4.149              0.326       4.100       1.230   
2               4.149              0.326       3.200       1.114   
3               4.149              0.326       4.600       1.114   

                           top_Genre  Genre_Title_count  Genre_LogPrice_amin  \
0         Action & Adventure (Books)            947.000                1.699   
1  Crime, Thriller & Mystery (Books)            276.000                1.833   
2         Action & Adventure (Books)            947.000                1.699   
3         Action & Adventure (Books)            947.000                1.699   

   Genre_LogPrice_mean  Genre_LogPrice_amax  Genre_LogPrice_std  \
0                2.521                3.748               0.242   
1                2.424                3.265               0.232   
2                2.521                3.748               0.242   
3                2.521                3.748               0.242   

   BookCategory_Title_count  BookCategory_LogPrice_amin  \
0                   818.000                       1.699   
1                   723.000                       1.833   
2                   818.000                       1.699   
3                   818.000                       1.699   

   BookCategory_LogPrice_mean  BookCategory_LogPrice_amax  \
0                       2.530                       3.748   
1                       2.456                       3.438   
2                       2.530                       3.748   
3                       2.530                       3.748   

   BookCategory_LogPrice_std  synopsis_len  year_month  title_seq  \
0                      0.250       131.000  201603.000      1.000   
1                      0.226       169.000  201405.000      0.750   
2                      0.250       122.000  201109.000      0.500   
3                      0.250       145.000  201009.000      0.250   

   title_seq_max  
0          4.000  
1          4.000  
2          4.000  
3          4.000  
In [83]:
merge_dataset = merge_dataset.\
sort_values(by =['Author','year','month']).\
reset_index(drop=True)
In [84]:
condition = ((merge_dataset['Price'] != -1) &\
             (merge_dataset['top_Author'] != 'other_Author'))
In [212]:
fig = px.scatter(merge_dataset[condition],\
             x='year_month',\
             y='Price',\
             color='top_Author',\
             log_y=True,
              range_x=(200000,202100))
fig.show()

Adding lag in reviews to capture last 3 reviews by the author

In [86]:
merge_dataset['review_num_lag1'] = merge_dataset['review_num'].shift(periods=1,fill_value=0)
merge_dataset['review_num_lag2'] = merge_dataset['review_num'].shift(periods=2,fill_value=0)
merge_dataset['review_num_lag3'] = merge_dataset['review_num'].shift(periods=3,fill_value=0)
In [87]:
for i in range(merge_dataset.shape[0]):
  if i > 3:
    if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-1,'Author']:
      merge_dataset.loc[i,'review_num_lag1'] = 0
    if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-2,'Author']:
      merge_dataset.loc[i,'review_num_lag2'] = 0
    if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-3,'Author']:
      merge_dataset.loc[i,'review_num_lag3'] = 0
In [88]:
logger.debug(f"\n logging few to demonstrate above \n {merge_dataset[merge_dataset['Author'] == 'Satyajit Ray'].\
sort_values('title_seq')}")
DEBUG:root:
 logging few to demonstrate above 
       index                                      Title        Author  \
6474   5760                             Childhood Days  Satyajit Ray   
6475   3638   Killer in Kailash (Adventures of Feluda)  Satyajit Ray   
6476   4454  Trouble in Gangtok (Adventures of Feluda)  Satyajit Ray   
6477   1009                          Speaking of Films  Satyajit Ray   
6478   2946                          Speaking of Films  Satyajit Ray   
6479   4253          Deep Focus: Reflections on Cinema  Satyajit Ray   
6480   1414   The Complete Adventures of Feluda Vol. 2  Satyajit Ray   
6481   3488   The Complete Adventures of Feluda Vol. 1  Satyajit Ray   
6482   4787   The Complete Adventures of Feluda Vol. 2  Satyajit Ray   
6483    326   The Complete Adventures of Feluda Vol. 1  Satyajit Ray   
6484   4044             The Pather Panchali Sketchbook  Satyajit Ray   

                      Edition             Reviews               Ratings  \
6474  Paperback,– 14 Oct 2000  5.0 out of 5 stars     1 customer review   
6475  Paperback,– 24 Sep 2003  4.5 out of 5 stars    5 customer reviews   
6476  Paperback,– 24 Sep 2003  5.0 out of 5 stars    7 customer reviews   
6477  Paperback,– 29 Jul 2005  4.4 out of 5 stars   30 customer reviews   
6478  Paperback,– 29 Jul 2005  4.4 out of 5 stars   30 customer reviews   
6479   Paperback,– 6 Dec 2013  4.5 out of 5 stars   40 customer reviews   
6480  Paperback,– 30 Jun 2015  4.6 out of 5 stars   77 customer reviews   
6481  Paperback,– 30 Jun 2015  4.7 out of 5 stars  104 customer reviews   
6482  Paperback,– 30 Jun 2015  4.6 out of 5 stars   77 customer reviews   
6483  Paperback,– 30 Jun 2015  4.7 out of 5 stars  104 customer reviews   
6484  Paperback,– 23 May 2016  4.9 out of 5 stars   25 customer reviews   

                                                                                                 Synopsis  \
6474  Childhood Days is a biography of Satyajit Ray, which he wrote on his own. Through this book, rea...   
6475  A stolen yakshi head. A plane crash. A vandal on the loose in Ellora an American buys a yakshi h...   
6476  Death in the mountains. An estranged son. A practitioner of the occult Feluda and Topshe are on ...   
6477  India’s greatest film-maker on the art and craft of films. Exactly fifty years ago, in 1955, the...   
6478  India’s greatest film-maker on the art and craft of films. Exactly fifty years ago, in 1955, the...   
6479  Satyajit Ray is acknowledged to be one of the world's finest film-makers. This book brings toget...   
6480  Nineteen gripping tales of suspense and mystery\nFor readers who enjoyed the adventures of Felud...   
6481  This omnibus edition features the ever-popular adventures of Satyajit Ray's enduring creation, t...   
6482  Nineteen gripping tales of suspense and mystery\nFor readers who enjoyed the adventures of Felud...   
6483  This omnibus edition features the ever-popular adventures of Satyajit Ray's enduring creation, t...   
6484  Pather Panchali placed Indian cinema on the world map in 1955: this is a simple statement of fac...   

                                  Genre                          BookCategory  \
6474             Indian Writing (Books)              Arts, Film & Photography   
6475  Crime, Thriller & Mystery (Books)             Crime, Thriller & Mystery   
6476  Crime, Thriller & Mystery (Books)             Crime, Thriller & Mystery   
6477         Cinema & Broadcast (Books)              Arts, Film & Photography   
6478         Cinema & Broadcast (Books)  Biographies, Diaries & True Accounts   
6479         Cinema & Broadcast (Books)              Arts, Film & Photography   
6480         Action & Adventure (Books)                    Action & Adventure   
6481         Action & Adventure (Books)                    Action & Adventure   
6482         Action & Adventure (Books)                    Action & Adventure   
6483         Action & Adventure (Books)                    Action & Adventure   
6484       Theatre & Spectacles (Books)              Arts, Film & Photography   

       Price  LogPrice LogPriceBucket  title_len Author_clean   top_Author  \
6474 205.000     2.312       Very Low      2.000  satyajitray  satyajitray   
6475 135.000     2.130       Very Low      6.000  satyajitray  satyajitray   
6476 135.000     2.130       Very Low      6.000  satyajitray  satyajitray   
6477 252.000     2.401            Low      3.000  satyajitray  satyajitray   
6478 252.000     2.401            Low      3.000  satyajitray  satyajitray   
6479 265.000     2.423            Low      5.000  satyajitray  satyajitray   
6480 299.000     2.476            Low      7.000  satyajitray  satyajitray   
6481 427.000     2.630            Med      7.000  satyajitray  satyajitray   
6482 304.000     2.483            Low      7.000  satyajitray  satyajitray   
6483  -1.000    -1.000         Target      7.000  satyajitray  satyajitray   
6484 779.000     2.892      Very High      4.000  satyajitray  satyajitray   

      Author_Title_count  Author_LogPrice_amin  Author_LogPrice_mean  \
6474              10.000                 2.130                 2.428   
6475              10.000                 2.130                 2.428   
6476              10.000                 2.130                 2.428   
6477              10.000                 2.130                 2.428   
6478              10.000                 2.130                 2.428   
6479              10.000                 2.130                 2.428   
6480              10.000                 2.130                 2.428   
6481              10.000                 2.130                 2.428   
6482              10.000                 2.130                 2.428   
6483              10.000                 2.130                 2.428   
6484              10.000                 2.130                 2.428   

      Author_LogPrice_amax  Author_LogPrice_std       bind        sour  month  \
6474                 2.892                0.224  Paperback  Other_sour 10.000   
6475                 2.892                0.224  Paperback  Other_sour  9.000   
6476                 2.892                0.224  Paperback  Other_sour  9.000   
6477                 2.892                0.224  Paperback  Other_sour  7.000   
6478                 2.892                0.224  Paperback  Other_sour  7.000   
6479                 2.892                0.224  Paperback  Other_sour 12.000   
6480                 2.892                0.224  Paperback  Other_sour  6.000   
6481                 2.892                0.224  Paperback  Other_sour  6.000   
6482                 2.892                0.224  Paperback  Other_sour  6.000   
6483                 2.892                0.224  Paperback  Other_sour  6.000   
6484                 2.892                0.224  Paperback  Other_sour  5.000   

         year   top_bind  bind_Title_count  bind_LogPrice_amin  \
6474 2000.000  Paperback          5197.000               1.398   
6475 2003.000  Paperback          5197.000               1.398   
6476 2003.000  Paperback          5197.000               1.398   
6477 2005.000  Paperback          5197.000               1.398   
6478 2005.000  Paperback          5197.000               1.398   
6479 2013.000  Paperback          5197.000               1.398   
6480 2015.000  Paperback          5197.000               1.398   
6481 2015.000  Paperback          5197.000               1.398   
6482 2015.000  Paperback          5197.000               1.398   
6483 2015.000  Paperback          5197.000               1.398   
6484 2016.000  Paperback          5197.000               1.398   

      bind_LogPrice_mean  bind_LogPrice_amax  bind_LogPrice_std    top_sour  \
6474               2.569               3.999              0.312  Other_sour   
6475               2.569               3.999              0.312  Other_sour   
6476               2.569               3.999              0.312  Other_sour   
6477               2.569               3.999              0.312  Other_sour   
6478               2.569               3.999              0.312  Other_sour   
6479               2.569               3.999              0.312  Other_sour   
6480               2.569               3.999              0.312  Other_sour   
6481               2.569               3.999              0.312  Other_sour   
6482               2.569               3.999              0.312  Other_sour   
6483               2.569               3.999              0.312  Other_sour   
6484               2.569               3.999              0.312  Other_sour   

      sour_Title_count  sour_LogPrice_amin  sour_LogPrice_mean  \
6474          5451.000               1.398               2.591   
6475          5451.000               1.398               2.591   
6476          5451.000               1.398               2.591   
6477          5451.000               1.398               2.591   
6478          5451.000               1.398               2.591   
6479          5451.000               1.398               2.591   
6480          5451.000               1.398               2.591   
6481          5451.000               1.398               2.591   
6482          5451.000               1.398               2.591   
6483          5451.000               1.398               2.591   
6484          5451.000               1.398               2.591   

      sour_LogPrice_amax  sour_LogPrice_std  review_num  rating_num  \
6474               4.149              0.326       5.000       1.041   
6475               4.149              0.326       4.500       1.176   
6476               4.149              0.326       5.000       1.230   
6477               4.149              0.326       4.400       1.602   
6478               4.149              0.326       4.400       1.602   
6479               4.149              0.326       4.500       1.699   
6480               4.149              0.326       4.600       1.940   
6481               4.149              0.326       4.700       2.057   
6482               4.149              0.326       4.600       1.940   
6483               4.149              0.326       4.700       2.057   
6484               4.149              0.326       4.900       1.544   

                              top_Genre  Genre_Title_count  \
6474                        other_Genre             70.000   
6475  Crime, Thriller & Mystery (Books)            276.000   
6476  Crime, Thriller & Mystery (Books)            276.000   
6477                        other_Genre             78.000   
6478                        other_Genre             78.000   
6479                        other_Genre             78.000   
6480         Action & Adventure (Books)            947.000   
6481         Action & Adventure (Books)            947.000   
6482         Action & Adventure (Books)            947.000   
6483         Action & Adventure (Books)            947.000   
6484                        other_Genre             36.000   

      Genre_LogPrice_amin  Genre_LogPrice_mean  Genre_LogPrice_amax  \
6474                1.903                2.249                3.267   
6475                1.833                2.424                3.265   
6476                1.833                2.424                3.265   
6477                2.265                2.747                3.450   
6478                2.265                2.747                3.450   
6479                2.265                2.747                3.450   
6480                1.699                2.521                3.748   
6481                1.699                2.521                3.748   
6482                1.699                2.521                3.748   
6483                1.699                2.521                3.748   
6484                1.833                2.766                3.322   

      Genre_LogPrice_std  BookCategory_Title_count  \
6474               0.240                   517.000   
6475               0.232                   723.000   
6476               0.232                   723.000   
6477               0.315                   517.000   
6478               0.315                   596.000   
6479               0.315                   517.000   
6480               0.242                   818.000   
6481               0.242                   818.000   
6482               0.242                   818.000   
6483               0.242                   818.000   
6484               0.372                   517.000   

      BookCategory_LogPrice_amin  BookCategory_LogPrice_mean  \
6474                       1.398                       2.773   
6475                       1.833                       2.456   
6476                       1.833                       2.456   
6477                       1.398                       2.773   
6478                       1.398                       2.478   
6479                       1.398                       2.773   
6480                       1.699                       2.530   
6481                       1.699                       2.530   
6482                       1.699                       2.530   
6483                       1.699                       2.530   
6484                       1.398                       2.773   

      BookCategory_LogPrice_amax  BookCategory_LogPrice_std  synopsis_len  \
6474                       4.069                      0.358       255.000   
6475                       3.438                      0.226        89.000   
6476                       3.438                      0.226        74.000   
6477                       4.069                      0.358       298.000   
6478                       3.438                      0.240       298.000   
6479                       4.069                      0.358       115.000   
6480                       3.748                      0.250        68.000   
6481                       3.748                      0.250        59.000   
6482                       3.748                      0.250        68.000   
6483                       3.748                      0.250        59.000   
6484                       4.069                      0.358       180.000   

      year_month  title_seq  title_seq_max  review_num_lag1  review_num_lag2  \
6474  200010.000      0.091         11.000            0.000            0.000   
6475  200309.000      0.227         11.000            5.000            0.000   
6476  200309.000      0.227         11.000            4.500            5.000   
6477  200507.000      0.409         11.000            5.000            4.500   
6478  200507.000      0.409         11.000            4.400            5.000   
6479  201312.000      0.545         11.000            4.400            4.400   
6480  201506.000      0.773         11.000            4.500            4.400   
6481  201506.000      0.773         11.000            4.600            4.500   
6482  201506.000      0.773         11.000            4.700            4.600   
6483  201506.000      0.773         11.000            4.600            4.700   
6484  201605.000      1.000         11.000            4.700            4.600   

      review_num_lag3  
6474            0.000  
6475            0.000  
6476            0.000  
6477            5.000  
6478            4.500  
6479            5.000  
6480            4.400  
6481            4.400  
6482            4.500  
6483            4.600  
6484            4.700  

Adding lag in ratings to capture last 3 reviews by the author

In [89]:
merge_dataset['rating_num_lag1'] = merge_dataset['rating_num'].shift(periods=1,fill_value=0)
merge_dataset['rating_num_lag2'] = merge_dataset['rating_num'].shift(periods=2,fill_value=0)
merge_dataset['rating_num_lag3'] = merge_dataset['rating_num'].shift(periods=3,fill_value=0)
In [90]:
for i in range(merge_dataset.shape[0]):
  if i > 3:
    if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-1,'Author']:
      merge_dataset.loc[i,'rating_num_lag1'] = 0
    if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-2,'Author']:
      merge_dataset.loc[i,'rating_num_lag2'] = 0
    if merge_dataset.loc[i,'Author'] != merge_dataset.loc[i-3,'Author']:
      merge_dataset.loc[i,'rating_num_lag3'] = 0
In [91]:
logger.debug(f"\n logging few to demonstrate above \n {merge_dataset[merge_dataset['Author'] == 'Satyajit Ray'].\
sort_values('title_seq')}")
DEBUG:root:
 logging few to demonstrate above 
       index                                      Title        Author  \
6474   5760                             Childhood Days  Satyajit Ray   
6475   3638   Killer in Kailash (Adventures of Feluda)  Satyajit Ray   
6476   4454  Trouble in Gangtok (Adventures of Feluda)  Satyajit Ray   
6477   1009                          Speaking of Films  Satyajit Ray   
6478   2946                          Speaking of Films  Satyajit Ray   
6479   4253          Deep Focus: Reflections on Cinema  Satyajit Ray   
6480   1414   The Complete Adventures of Feluda Vol. 2  Satyajit Ray   
6481   3488   The Complete Adventures of Feluda Vol. 1  Satyajit Ray   
6482   4787   The Complete Adventures of Feluda Vol. 2  Satyajit Ray   
6483    326   The Complete Adventures of Feluda Vol. 1  Satyajit Ray   
6484   4044             The Pather Panchali Sketchbook  Satyajit Ray   

                      Edition             Reviews               Ratings  \
6474  Paperback,– 14 Oct 2000  5.0 out of 5 stars     1 customer review   
6475  Paperback,– 24 Sep 2003  4.5 out of 5 stars    5 customer reviews   
6476  Paperback,– 24 Sep 2003  5.0 out of 5 stars    7 customer reviews   
6477  Paperback,– 29 Jul 2005  4.4 out of 5 stars   30 customer reviews   
6478  Paperback,– 29 Jul 2005  4.4 out of 5 stars   30 customer reviews   
6479   Paperback,– 6 Dec 2013  4.5 out of 5 stars   40 customer reviews   
6480  Paperback,– 30 Jun 2015  4.6 out of 5 stars   77 customer reviews   
6481  Paperback,– 30 Jun 2015  4.7 out of 5 stars  104 customer reviews   
6482  Paperback,– 30 Jun 2015  4.6 out of 5 stars   77 customer reviews   
6483  Paperback,– 30 Jun 2015  4.7 out of 5 stars  104 customer reviews   
6484  Paperback,– 23 May 2016  4.9 out of 5 stars   25 customer reviews   

                                                                                                 Synopsis  \
6474  Childhood Days is a biography of Satyajit Ray, which he wrote on his own. Through this book, rea...   
6475  A stolen yakshi head. A plane crash. A vandal on the loose in Ellora an American buys a yakshi h...   
6476  Death in the mountains. An estranged son. A practitioner of the occult Feluda and Topshe are on ...   
6477  India’s greatest film-maker on the art and craft of films. Exactly fifty years ago, in 1955, the...   
6478  India’s greatest film-maker on the art and craft of films. Exactly fifty years ago, in 1955, the...   
6479  Satyajit Ray is acknowledged to be one of the world's finest film-makers. This book brings toget...   
6480  Nineteen gripping tales of suspense and mystery\nFor readers who enjoyed the adventures of Felud...   
6481  This omnibus edition features the ever-popular adventures of Satyajit Ray's enduring creation, t...   
6482  Nineteen gripping tales of suspense and mystery\nFor readers who enjoyed the adventures of Felud...   
6483  This omnibus edition features the ever-popular adventures of Satyajit Ray's enduring creation, t...   
6484  Pather Panchali placed Indian cinema on the world map in 1955: this is a simple statement of fac...   

                                  Genre                          BookCategory  \
6474             Indian Writing (Books)              Arts, Film & Photography   
6475  Crime, Thriller & Mystery (Books)             Crime, Thriller & Mystery   
6476  Crime, Thriller & Mystery (Books)             Crime, Thriller & Mystery   
6477         Cinema & Broadcast (Books)              Arts, Film & Photography   
6478         Cinema & Broadcast (Books)  Biographies, Diaries & True Accounts   
6479         Cinema & Broadcast (Books)              Arts, Film & Photography   
6480         Action & Adventure (Books)                    Action & Adventure   
6481         Action & Adventure (Books)                    Action & Adventure   
6482         Action & Adventure (Books)                    Action & Adventure   
6483         Action & Adventure (Books)                    Action & Adventure   
6484       Theatre & Spectacles (Books)              Arts, Film & Photography   

       Price  LogPrice LogPriceBucket  title_len Author_clean   top_Author  \
6474 205.000     2.312       Very Low      2.000  satyajitray  satyajitray   
6475 135.000     2.130       Very Low      6.000  satyajitray  satyajitray   
6476 135.000     2.130       Very Low      6.000  satyajitray  satyajitray   
6477 252.000     2.401            Low      3.000  satyajitray  satyajitray   
6478 252.000     2.401            Low      3.000  satyajitray  satyajitray   
6479 265.000     2.423            Low      5.000  satyajitray  satyajitray   
6480 299.000     2.476            Low      7.000  satyajitray  satyajitray   
6481 427.000     2.630            Med      7.000  satyajitray  satyajitray   
6482 304.000     2.483            Low      7.000  satyajitray  satyajitray   
6483  -1.000    -1.000         Target      7.000  satyajitray  satyajitray   
6484 779.000     2.892      Very High      4.000  satyajitray  satyajitray   

      Author_Title_count  Author_LogPrice_amin  Author_LogPrice_mean  \
6474              10.000                 2.130                 2.428   
6475              10.000                 2.130                 2.428   
6476              10.000                 2.130                 2.428   
6477              10.000                 2.130                 2.428   
6478              10.000                 2.130                 2.428   
6479              10.000                 2.130                 2.428   
6480              10.000                 2.130                 2.428   
6481              10.000                 2.130                 2.428   
6482              10.000                 2.130                 2.428   
6483              10.000                 2.130                 2.428   
6484              10.000                 2.130                 2.428   

      Author_LogPrice_amax  Author_LogPrice_std       bind        sour  month  \
6474                 2.892                0.224  Paperback  Other_sour 10.000   
6475                 2.892                0.224  Paperback  Other_sour  9.000   
6476                 2.892                0.224  Paperback  Other_sour  9.000   
6477                 2.892                0.224  Paperback  Other_sour  7.000   
6478                 2.892                0.224  Paperback  Other_sour  7.000   
6479                 2.892                0.224  Paperback  Other_sour 12.000   
6480                 2.892                0.224  Paperback  Other_sour  6.000   
6481                 2.892                0.224  Paperback  Other_sour  6.000   
6482                 2.892                0.224  Paperback  Other_sour  6.000   
6483                 2.892                0.224  Paperback  Other_sour  6.000   
6484                 2.892                0.224  Paperback  Other_sour  5.000   

         year   top_bind  bind_Title_count  bind_LogPrice_amin  \
6474 2000.000  Paperback          5197.000               1.398   
6475 2003.000  Paperback          5197.000               1.398   
6476 2003.000  Paperback          5197.000               1.398   
6477 2005.000  Paperback          5197.000               1.398   
6478 2005.000  Paperback          5197.000               1.398   
6479 2013.000  Paperback          5197.000               1.398   
6480 2015.000  Paperback          5197.000               1.398   
6481 2015.000  Paperback          5197.000               1.398   
6482 2015.000  Paperback          5197.000               1.398   
6483 2015.000  Paperback          5197.000               1.398   
6484 2016.000  Paperback          5197.000               1.398   

      bind_LogPrice_mean  bind_LogPrice_amax  bind_LogPrice_std    top_sour  \
6474               2.569               3.999              0.312  Other_sour   
6475               2.569               3.999              0.312  Other_sour   
6476               2.569               3.999              0.312  Other_sour   
6477               2.569               3.999              0.312  Other_sour   
6478               2.569               3.999              0.312  Other_sour   
6479               2.569               3.999              0.312  Other_sour   
6480               2.569               3.999              0.312  Other_sour   
6481               2.569               3.999              0.312  Other_sour   
6482               2.569               3.999              0.312  Other_sour   
6483               2.569               3.999              0.312  Other_sour   
6484               2.569               3.999              0.312  Other_sour   

      sour_Title_count  sour_LogPrice_amin  sour_LogPrice_mean  \
6474          5451.000               1.398               2.591   
6475          5451.000               1.398               2.591   
6476          5451.000               1.398               2.591   
6477          5451.000               1.398               2.591   
6478          5451.000               1.398               2.591   
6479          5451.000               1.398               2.591   
6480          5451.000               1.398               2.591   
6481          5451.000               1.398               2.591   
6482          5451.000               1.398               2.591   
6483          5451.000               1.398               2.591   
6484          5451.000               1.398               2.591   

      sour_LogPrice_amax  sour_LogPrice_std  review_num  rating_num  \
6474               4.149              0.326       5.000       1.041   
6475               4.149              0.326       4.500       1.176   
6476               4.149              0.326       5.000       1.230   
6477               4.149              0.326       4.400       1.602   
6478               4.149              0.326       4.400       1.602   
6479               4.149              0.326       4.500       1.699   
6480               4.149              0.326       4.600       1.940   
6481               4.149              0.326       4.700       2.057   
6482               4.149              0.326       4.600       1.940   
6483               4.149              0.326       4.700       2.057   
6484               4.149              0.326       4.900       1.544   

                              top_Genre  Genre_Title_count  \
6474                        other_Genre             70.000   
6475  Crime, Thriller & Mystery (Books)            276.000   
6476  Crime, Thriller & Mystery (Books)            276.000   
6477                        other_Genre             78.000   
6478                        other_Genre             78.000   
6479                        other_Genre             78.000   
6480         Action & Adventure (Books)            947.000   
6481         Action & Adventure (Books)            947.000   
6482         Action & Adventure (Books)            947.000   
6483         Action & Adventure (Books)            947.000   
6484                        other_Genre             36.000   

      Genre_LogPrice_amin  Genre_LogPrice_mean  Genre_LogPrice_amax  \
6474                1.903                2.249                3.267   
6475                1.833                2.424                3.265   
6476                1.833                2.424                3.265   
6477                2.265                2.747                3.450   
6478                2.265                2.747                3.450   
6479                2.265                2.747                3.450   
6480                1.699                2.521                3.748   
6481                1.699                2.521                3.748   
6482                1.699                2.521                3.748   
6483                1.699                2.521                3.748   
6484                1.833                2.766                3.322   

      Genre_LogPrice_std  BookCategory_Title_count  \
6474               0.240                   517.000   
6475               0.232                   723.000   
6476               0.232                   723.000   
6477               0.315                   517.000   
6478               0.315                   596.000   
6479               0.315                   517.000   
6480               0.242                   818.000   
6481               0.242                   818.000   
6482               0.242                   818.000   
6483               0.242                   818.000   
6484               0.372                   517.000   

      BookCategory_LogPrice_amin  BookCategory_LogPrice_mean  \
6474                       1.398                       2.773   
6475                       1.833                       2.456   
6476                       1.833                       2.456   
6477                       1.398                       2.773   
6478                       1.398                       2.478   
6479                       1.398                       2.773   
6480                       1.699                       2.530   
6481                       1.699                       2.530   
6482                       1.699                       2.530   
6483                       1.699                       2.530   
6484                       1.398                       2.773   

      BookCategory_LogPrice_amax  BookCategory_LogPrice_std  synopsis_len  \
6474                       4.069                      0.358       255.000   
6475                       3.438                      0.226        89.000   
6476                       3.438                      0.226        74.000   
6477                       4.069                      0.358       298.000   
6478                       3.438                      0.240       298.000   
6479                       4.069                      0.358       115.000   
6480                       3.748                      0.250        68.000   
6481                       3.748                      0.250        59.000   
6482                       3.748                      0.250        68.000   
6483                       3.748                      0.250        59.000   
6484                       4.069                      0.358       180.000   

      year_month  title_seq  title_seq_max  review_num_lag1  review_num_lag2  \
6474  200010.000      0.091         11.000            0.000            0.000   
6475  200309.000      0.227         11.000            5.000            0.000   
6476  200309.000      0.227         11.000            4.500            5.000   
6477  200507.000      0.409         11.000            5.000            4.500   
6478  200507.000      0.409         11.000            4.400            5.000   
6479  201312.000      0.545         11.000            4.400            4.400   
6480  201506.000      0.773         11.000            4.500            4.400   
6481  201506.000      0.773         11.000            4.600            4.500   
6482  201506.000      0.773         11.000            4.700            4.600   
6483  201506.000      0.773         11.000            4.600            4.700   
6484  201605.000      1.000         11.000            4.700            4.600   

      review_num_lag3  rating_num_lag1  rating_num_lag2  rating_num_lag3  
6474            0.000            0.000            0.000            0.000  
6475            0.000            1.041            0.000            0.000  
6476            0.000            1.176            1.041            0.000  
6477            5.000            1.230            1.176            1.041  
6478            4.500            1.602            1.230            1.176  
6479            5.000            1.602            1.602            1.230  
6480            4.400            1.699            1.602            1.602  
6481            4.400            1.940            1.699            1.602  
6482            4.500            2.057            1.940            1.699  
6483            4.600            1.940            2.057            1.940  
6484            4.700            2.057            1.940            2.057  
In [92]:
train_data = merge_dataset[merge_dataset['LogPrice'] != -1]
test_data = merge_dataset[merge_dataset['LogPrice'] == -1]
logger.debug(f'Shape of test_dataset is {train_data.shape}')
logger.debug(f'Shape of test_dataset is {test_data.shape}')
DEBUG:root:Shape of test_dataset is (6237, 59)
DEBUG:root:Shape of test_dataset is (1560, 59)
In [93]:
#Rearranging test_data orders
test_data = test_data.sort_values(by='index').reset_index(drop=True)
In [94]:
logger.debug(f"Top 5 values {test_data.head()}")
DEBUG:root:Top 5 values    index  \
0      0   
1      1   
2      2   
3      3   
4      4   

                                                                                                 Title  \
0                                                           The Complete Sherlock Holmes: 2 Boxes sets   
1  Learn Docker - Fundamentals of Docker 18.x: Everything you need to know about containerizing you...   
2                                                                                             Big Girl   
3                                                 Think Python: How to Think Like a Computer Scientist   
4                          Oxford Word Skills: Advanced - Idioms & Phrasal Verbs Student Book with Key   

                   Author                             Edition  \
0  Sir Arthur Conan Doyle  Mass Market Paperback,– 1 Oct 1986   
1     Gabriel N. Schenker     Paperback,– Import, 26 Apr 2018   
2          Danielle Steel             Paperback,– 17 Mar 2011   
3         Allen B. Downey                    Paperback,– 2016   
4           Redman Gairns             Paperback,– 26 Dec 2011   

              Reviews               Ratings  \
0  4.4 out of 5 stars  960 customer reviews   
1  5.0 out of 5 stars     1 customer review   
2  5.0 out of 5 stars    4 customer reviews   
3  4.1 out of 5 stars   11 customer reviews   
4  4.4 out of 5 stars    9 customer reviews   

                                                                                              Synopsis  \
0  A collection of entire body of work of the Sherlock Holmes Series by Arthur Conan Doyle, 'The Co...   
1  Enhance your software deployment workflow using containers Key Features Get up-and-running with ...   
2  'Watch out, world. Here I come!'\nFor Victoria Dawson, growing up isn't a happy experience. Born...   
3  If you want to learn how to program, working with Python is an excellent way to start. This hand...   
4  Learn and practise the verbs, prepositions and idioms you need to speak and write naturally in E...   

                                        Genre  \
0                       Short Stories (Books)   
1                 Operating Systems Textbooks   
2                             Romance (Books)   
3  Programming & Software Development (Books)   
4                         Linguistics (Books)   

                          BookCategory  Price  LogPrice LogPriceBucket  \
0            Crime, Thriller & Mystery -1.000    -1.000         Target   
1  Computing, Internet & Digital Media -1.000    -1.000         Target   
2                              Romance -1.000    -1.000         Target   
3  Computing, Internet & Digital Media -1.000    -1.000         Target   
4      Language, Linguistics & Writing -1.000    -1.000         Target   

   title_len         Author_clean     top_Author  Author_Title_count  \
0      7.000  sirarthurconandoyle   other_Author               6.000   
1     21.000     gabrielnschenker   other_Author               0.000   
2      2.000        daniellesteel  daniellesteel              18.000   
3      9.000         allenbdowney   other_Author               0.000   
4     13.000         redmangairns   other_Author               3.000   

   Author_LogPrice_amin  Author_LogPrice_mean  Author_LogPrice_amax  \
0                 2.173                 2.438                 2.840   
1                 1.898                 2.833                 3.764   
2                 2.097                 2.363                 2.594   
3                 1.898                 2.833                 3.764   
4                 2.594                 2.672                 2.760   

   Author_LogPrice_std                   bind        sour  month     year  \
0                0.272  Mass Market Paperback  Other_sour 10.000 1986.000   
1                0.000              Paperback      Import  4.000 2018.000   
2                0.125              Paperback  Other_sour  3.000 2011.000   
3                0.000              Paperback  Other_sour  0.000 2016.000   
4                0.083              Paperback  Other_sour 12.000 2011.000   

                top_bind  bind_Title_count  bind_LogPrice_amin  \
0  Mass Market Paperback           155.000               1.568   
1              Paperback          5197.000               1.398   
2              Paperback          5197.000               1.398   
3              Paperback          5197.000               1.398   
4              Paperback          5197.000               1.398   

   bind_LogPrice_mean  bind_LogPrice_amax  bind_LogPrice_std    top_sour  \
0               2.481               3.283              0.224  Other_sour   
1               2.569               3.999              0.312      Import   
2               2.569               3.999              0.312  Other_sour   
3               2.569               3.999              0.312  Other_sour   
4               2.569               3.999              0.312  Other_sour   

   sour_Title_count  sour_LogPrice_amin  sour_LogPrice_mean  \
0          5451.000               1.398               2.591   
1           616.000               1.602               2.665   
2          5451.000               1.398               2.591   
3          5451.000               1.398               2.591   
4          5451.000               1.398               2.591   

   sour_LogPrice_amax  sour_LogPrice_std  review_num  rating_num  \
0               4.149              0.326       4.400       2.987   
1               4.122              0.365       5.000       1.041   
2               4.149              0.326       5.000       1.146   
3               4.149              0.326       4.100       1.322   
4               4.149              0.326       4.400       1.279   

         top_Genre  Genre_Title_count  Genre_LogPrice_amin  \
0      other_Genre             31.000                1.568   
1      other_Genre              5.000                2.447   
2  Romance (Books)            419.000                1.699   
3      other_Genre             32.000                2.423   
4      other_Genre             57.000                1.763   

   Genre_LogPrice_mean  Genre_LogPrice_amax  Genre_LogPrice_std  \
0                2.513                2.952               0.271   
1                3.060                3.408               0.361   
2                2.422                3.999               0.272   
3                2.829                3.345               0.224   
4                2.589                3.308               0.292   

   BookCategory_Title_count  BookCategory_LogPrice_amin  \
0                   723.000                       1.833   
1                   510.000                       1.898   
2                   560.000                       1.568   
3                   510.000                       1.898   
4                   594.000                       1.477   

   BookCategory_LogPrice_mean  BookCategory_LogPrice_amax  \
0                       2.456                       3.438   
1                       2.833                       3.764   
2                       2.437                       3.999   
3                       2.833                       3.764   
4                       2.487                       3.714   

   BookCategory_LogPrice_std  synopsis_len  year_month  title_seq  \
0                      0.226       323.000  198610.000      0.158   
1                      0.354       312.000  201804.000      0.000   
2                      0.281       172.000  201103.000      0.240   
3                      0.354       181.000  201600.000      0.000   
4                      0.386        17.000  201112.000      1.000   

   title_seq_max  review_num_lag1  review_num_lag2  review_num_lag3  \
0          9.500            4.400            0.000            0.000   
1          1.000            0.000            0.000            0.000   
2         25.000            5.000            4.500            4.000   
3          1.000            0.000            0.000            0.000   
4          4.000            4.500            4.400            4.500   

   rating_num_lag1  rating_num_lag2  rating_num_lag3  
0            2.986            0.000            0.000  
1            0.000            0.000            0.000  
2            1.230            1.079            1.079  
3            0.000            0.000            0.000  
4            1.544            1.322            1.491  
In [95]:
y = train_data[target]
X = train_data.drop(target,axis=1)
X_tr = X[X['Title'].isin(test_dataset['Title'])]
y_tr = y[X['Title'].isin(test_dataset['Title']).index]


X_train,X_val,y_train,y_val = train_test_split(X,y,random_state=123,test_size=.3)
X_train = pd.concat([X_train,X_tr]).drop_duplicates()
y_train = y[X_train.index]

logger.debug(f'Shape of X_train is {X_train.shape}')
logger.debug(f'Shape of X_val is {X_val.shape}')
logger.debug(f'Shape of Y_train is {y_train.shape}')
logger.debug(f'Shape of Y_val is {y_val.shape}')
DEBUG:root:Shape of X_train is (4463, 58)
DEBUG:root:Shape of X_val is (1872, 58)
DEBUG:root:Shape of Y_train is (4463,)
DEBUG:root:Shape of Y_val is (1872,)
In [96]:
num_cols = num_cols + ['year_month','title_seq', 'title_seq_max',\
                       'review_num_lag1', 'review_num_lag2',\
                       'review_num_lag3', 'rating_num_lag1', 'rating_num_lag2',\
                       'rating_num_lag3']

Categorical variables

In [97]:
mapper = DataFrameMapper([([cat_col],OneHotEncoder()) for cat_col in cat_cols],\
                         df_out=True)
mapper.fit(X_train)
Out[97]:
DataFrameMapper(default=False, df_out=True,
                features=[(['BookCategory'],
                           OneHotEncoder(categories='auto', drop=None,
                                         dtype=<class 'numpy.float64'>,
                                         handle_unknown='error', sparse=True)),
                          (['top_Author'],
                           OneHotEncoder(categories='auto', drop=None,
                                         dtype=<class 'numpy.float64'>,
                                         handle_unknown='error', sparse=True)),
                          (['top_bind'],
                           OneHotEncoder(categories='auto', drop=None,
                                         dtype=<class 'numpy.float64'>,
                                         handle_unknown='error', sparse=True)),
                          (['top_sour'],
                           OneHotEncoder(categories='auto', drop=None,
                                         dtype=<class 'numpy.float64'>,
                                         handle_unknown='error', sparse=True)),
                          (['top_Genre'],
                           OneHotEncoder(categories='auto', drop=None,
                                         dtype=<class 'numpy.float64'>,
                                         handle_unknown='error',
                                         sparse=True))],
                input_df=False, sparse=False)
In [98]:
X_train_ohe = mapper.transform(X_train)
X_val_ohe = mapper.transform(X_val)
X_test_ohe = mapper.transform(test_data)
logger.debug(f'Shape of X_train_ohe is {X_train_ohe.shape}')
logger.debug(f'Shape of X_val_ohe is {X_val_ohe.shape}')
logger.debug(f'Shape of X_test_ohe is {X_test_ohe.shape}')
DEBUG:root:Shape of X_train_ohe is (4463, 105)
DEBUG:root:Shape of X_val_ohe is (1872, 105)
DEBUG:root:Shape of X_test_ohe is (1560, 105)

Numerical variables

In [99]:
scl = MinMaxScaler()
scl.fit(X_train[num_cols])
X_train_std = pd.DataFrame(scl.transform(X_train[num_cols]),
                                        columns = num_cols,
                           index = X_train.index)
X_val_std = pd.DataFrame(scl.transform(X_val[num_cols]),
                                       columns=num_cols,
                         index=X_val.index)
test_dataset_std = pd.DataFrame(scl.transform(test_data[num_cols]),
                                              columns=num_cols,
                                index=test_data.index)

logger.debug(f'Shape of X_train is {X_train_std.shape}')
logger.debug(f'Shape of X_val is {X_val_std.shape}')
logger.debug(f'Shape of X_test is {test_dataset_std.shape}')
DEBUG:root:Shape of X_train is (4463, 40)
DEBUG:root:Shape of X_val is (1872, 40)
DEBUG:root:Shape of X_test is (1560, 40)

Text variables

Strategy-TFIDF
In [100]:
encoding_strategy =  'tfidfTitle' 
In [101]:
tfidf = TfidfVectorizer(ngram_range=(1,3),\
                        min_df=20,\
                        stop_words=STOP_WORDS)

X_train_title = tfidf.fit_transform(X_train['Title'])
features = [ 'Title_' + str(i)  for i in tfidf.get_feature_names()]

X_train_title = pd.DataFrame(X_train_title.toarray(),\
                             columns=features,\
                             index = X_train.index)
logger.debug(f'Features are {features}')
logger.debug(f'Shape of X_train is  {X_train_title.shape}')
/usr/local/lib/python3.7/dist-packages/sklearn/feature_extraction/text.py:385: UserWarning:

Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ll', 've'] not in stop_words.

DEBUG:root:Features are ['Title_10', 'Title_2018', 'Title_adventures', 'Title_album', 'Title_answers', 'Title_art', 'Title_asterix', 'Title_autobiography', 'Title_best', 'Title_big', 'Title_biography', 'Title_black', 'Title_book', 'Title_books', 'Title_business', 'Title_calvin', 'Title_calvin hobbes', 'Title_cambridge', 'Title_cat', 'Title_cd', 'Title_classics', 'Title_collection', 'Title_college', 'Title_comics', 'Title_complete', 'Title_computer', 'Title_course', 'Title_creative', 'Title_cricket', 'Title_dark', 'Title_data', 'Title_day', 'Title_days', 'Title_death', 'Title_design', 'Title_dictionary', 'Title_digital', 'Title_dover', 'Title_easy', 'Title_edition', 'Title_english', 'Title_essential', 'Title_exam', 'Title_fire', 'Title_game', 'Title_games', 'Title_girl', 'Title_good', 'Title_grade', 'Title_grammar', 'Title_graphic', 'Title_graphic novel', 'Title_graphic novels', 'Title_great', 'Title_greatest', 'Title_guide', 'Title_history', 'Title_hobbes', 'Title_house', 'Title_ice', 'Title_ideas', 'Title_illustrated', 'Title_india', 'Title_indian', 'Title_inside', 'Title_international', 'Title_introduction', 'Title_jack', 'Title_journey', 'Title_key', 'Title_know', 'Title_language', 'Title_learn', 'Title_learning', 'Title_lessons', 'Title_level', 'Title_library', 'Title_life', 'Title_little', 'Title_lost', 'Title_love', 'Title_machine', 'Title_magic', 'Title_making', 'Title_man', 'Title_men', 'Title_mind', 'Title_modern', 'Title_modern classics', 'Title_music', 'Title_new', 'Title_night', 'Title_novel', 'Title_novels', 'Title_omnibus', 'Title_oxford', 'Title_penguin', 'Title_penguin classics', 'Title_penguin modern', 'Title_people', 'Title_piano', 'Title_power', 'Title_practice', 'Title_programming', 'Title_python', 'Title_read', 'Title_red', 'Title_reference', 'Title_revised', 'Title_science', 'Title_secret', 'Title_secrets', 'Title_self', 'Title_series', 'Title_set', 'Title_short', 'Title_stories', 'Title_story', 'Title_techniques', 'Title_test', 'Title_theory', 'Title_things', 'Title_thriller', 'Title_time', 'Title_times', 'Title_tintin', 'Title_training', 'Title_trilogy', 'Title_true', 'Title_ultimate', 'Title_vintage', 'Title_vol', 'Title_volume', 'Title_war', 'Title_way', 'Title_women', 'Title_words', 'Title_world', 'Title_years']
DEBUG:root:Shape of X_train is  (4463, 139)
In [102]:
tfidf_vocab = pd.concat([pd.Series(tfidf.get_feature_names(),\
                                   name='features'),\
           pd.Series(tfidf.idf_,\
                     name='idf')],axis=1)
logger.debug(f" Logging omnibus \n{tfidf_vocab[tfidf_vocab['features'] == 'complete']}")
logger.debug(f"\n{tfidf_vocab.sort_values('idf',ascending=False)}")
DEBUG:root: Logging omnibus 
    features   idf
24  complete 5.047
DEBUG:root:
          features   idf
138          years 6.359
45           games 6.359
65   international 6.359
74         lessons 6.359
61     illustrated 6.359
..             ...   ...
113         series 4.592
137          world 4.536
39         edition 4.469
55           guide 4.316
12            book 3.769

[139 rows x 2 columns]
In [103]:
X_val_title = tfidf.transform(X_val['Title'])
X_val_title = pd.DataFrame(X_val_title.toarray(),\
                           columns=features,\
                           index=X_val.index)

test_data_title = tfidf.transform(test_data['Title'])
test_data_title = pd.DataFrame(test_data_title.toarray(),\
                           columns=features,\
                           index=test_data.index)
In [104]:
encoding_strategy =  'tfidfSynopsis' + encoding_strategy
In [105]:
stop_words = set(list(STOP_WORDS)+['book','world','books'])
In [106]:
tfidf = TfidfVectorizer(ngram_range=(1,3),\
                        min_df=250,\
                        stop_words=stop_words)

X_train_synopsis = tfidf.fit_transform(X_train['Synopsis'])
features = [ 'Synopsis_' + str(i)  for i in tfidf.get_feature_names()]

X_train_synopsis = pd.DataFrame(X_train_synopsis.toarray(),\
                             columns=features,\
                             index = X_train.index)
logger.debug(f'Features are {features}')
logger.debug(f'Shape of X_train is  {X_train_synopsis.shape}')

X_val_synopsis = tfidf.transform(X_val['Synopsis'])
X_val_synopsis = pd.DataFrame(X_val_synopsis.toarray(),\
                           columns=features,\
                           index=X_val.index)

test_data_synopsis = tfidf.transform(test_data['Synopsis'])
test_data_synopsis = pd.DataFrame(test_data_synopsis.toarray(),\
                           columns=features,\
                           index=test_data.index)
/usr/local/lib/python3.7/dist-packages/sklearn/feature_extraction/text.py:385: UserWarning:

Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ll', 've'] not in stop_words.

DEBUG:root:Features are ['Synopsis_age', 'Synopsis_art', 'Synopsis_author', 'Synopsis_available', 'Synopsis_based', 'Synopsis_beautiful', 'Synopsis_best', 'Synopsis_bestseller', 'Synopsis_bestselling', 'Synopsis_business', 'Synopsis_century', 'Synopsis_classic', 'Synopsis_collection', 'Synopsis_come', 'Synopsis_comes', 'Synopsis_day', 'Synopsis_death', 'Synopsis_different', 'Synopsis_easy', 'Synopsis_edition', 'Synopsis_end', 'Synopsis_english', 'Synopsis_family', 'Synopsis_features', 'Synopsis_find', 'Synopsis_game', 'Synopsis_good', 'Synopsis_great', 'Synopsis_greatest', 'Synopsis_guide', 'Synopsis_heart', 'Synopsis_help', 'Synopsis_high', 'Synopsis_history', 'Synopsis_home', 'Synopsis_human', 'Synopsis_important', 'Synopsis_includes', 'Synopsis_including', 'Synopsis_india', 'Synopsis_indian', 'Synopsis_journey', 'Synopsis_key', 'Synopsis_know', 'Synopsis_known', 'Synopsis_language', 'Synopsis_learn', 'Synopsis_learning', 'Synopsis_life', 'Synopsis_like', 'Synopsis_lives', 'Synopsis_ll', 'Synopsis_long', 'Synopsis_love', 'Synopsis_makes', 'Synopsis_making', 'Synopsis_man', 'Synopsis_modern', 'Synopsis_need', 'Synopsis_new', 'Synopsis_new york', 'Synopsis_new york times', 'Synopsis_novel', 'Synopsis_old', 'Synopsis_past', 'Synopsis_people', 'Synopsis_perfect', 'Synopsis_personal', 'Synopsis_popular', 'Synopsis_power', 'Synopsis_powerful', 'Synopsis_provides', 'Synopsis_published', 'Synopsis_questions', 'Synopsis_read', 'Synopsis_readers', 'Synopsis_reading', 'Synopsis_real', 'Synopsis_second', 'Synopsis_secret', 'Synopsis_series', 'Synopsis_set', 'Synopsis_simple', 'Synopsis_stories', 'Synopsis_story', 'Synopsis_students', 'Synopsis_takes', 'Synopsis_time', 'Synopsis_times', 'Synopsis_today', 'Synopsis_true', 'Synopsis_use', 'Synopsis_war', 'Synopsis_way', 'Synopsis_woman', 'Synopsis_words', 'Synopsis_work', 'Synopsis_writing', 'Synopsis_written', 'Synopsis_year', 'Synopsis_years', 'Synopsis_york', 'Synopsis_york times', 'Synopsis_young']
DEBUG:root:Shape of X_train is  (4463, 104)
In [107]:
tfidf_vocab = pd.concat([pd.Series(tfidf.get_feature_names(),\
                                   name='features'),\
           pd.Series(tfidf.idf_,\
                     name='idf')],axis=1)
logger.debug(f" Logging omnibus \n{tfidf_vocab[tfidf_vocab['features'] == 'edition']}")
logger.debug(f"\n{tfidf_vocab.sort_values('idf',ascending=False)}")
DEBUG:root: Logging omnibus 
   features   idf
19  edition 3.376
DEBUG:root:
      features   idf
5    beautiful 3.874
82      simple 3.866
25        game 3.866
0          age 3.859
95       words 3.855
..         ...   ...
100      years 2.725
84       story 2.615
87        time 2.540
48        life 2.218
59         new 2.152

[104 rows x 2 columns]
Strategy-Glove
In [108]:
encoding_strategy = 'gloveTitle' + encoding_strategy
In [109]:
gv = fe.GloveAutoEncoder()
X_train_title = gv.create_glove_encoding(docs=X_train['Title'],\
                                         col_name='Title',\
                                         max_length=9,\
                                         epochs=200,
                                         learning_rate=.5)

X_val_title = gv.transform_glove_encoding(X_val['Title'])
test_data_title = gv.transform_glove_encoding(test_data['Title'])
There are 6576 words in data
Indexing word vector
Indexed 400000 words
Here is shape of embedding matrix (6577, 100)
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 9)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 9, 100)            657700    
_________________________________________________________________
conv1d (Conv1D)              (None, 9, 2)              602       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 3, 2)              0         
_________________________________________________________________
flatten (Flatten)            (None, 6)                 0         
_________________________________________________________________
batch_normalization (BatchNo (None, 6)                 24        
_________________________________________________________________
dense (Dense)                (None, 9)                 63        
=================================================================
Total params: 658,389
Trainable params: 677
Non-trainable params: 657,712
_________________________________________________________________
None
Epoch 1/200
140/140 [==============================] - 2s 4ms/step - loss: 1705210.5000 - val_loss: 1695805.2500
Epoch 2/200
140/140 [==============================] - 0s 3ms/step - loss: 1530186.1250 - val_loss: 1578142.3750
Epoch 3/200
140/140 [==============================] - 0s 3ms/step - loss: 1491964.5000 - val_loss: 1461169.1250
Epoch 4/200
140/140 [==============================] - 0s 3ms/step - loss: 1479442.3750 - val_loss: 1426874.6250
Epoch 5/200
140/140 [==============================] - 0s 3ms/step - loss: 1470109.0000 - val_loss: 1424426.5000
Epoch 6/200
140/140 [==============================] - 0s 3ms/step - loss: 1450911.7500 - val_loss: 1414909.3750
Epoch 7/200
140/140 [==============================] - 0s 3ms/step - loss: 1458703.0000 - val_loss: 1409876.8750
Epoch 8/200
140/140 [==============================] - 0s 3ms/step - loss: 1448543.0000 - val_loss: 1415105.8750
Epoch 9/200
140/140 [==============================] - 0s 3ms/step - loss: 1445677.8750 - val_loss: 1408713.7500
Epoch 10/200
140/140 [==============================] - 0s 3ms/step - loss: 1439947.2500 - val_loss: 1417996.0000
Epoch 11/200
140/140 [==============================] - 0s 3ms/step - loss: 1439670.5000 - val_loss: 1412938.7500
Epoch 12/200
140/140 [==============================] - 0s 3ms/step - loss: 1439932.7500 - val_loss: 1401576.3750
Epoch 13/200
140/140 [==============================] - 0s 3ms/step - loss: 1439701.0000 - val_loss: 1397674.3750
Epoch 14/200
140/140 [==============================] - 0s 3ms/step - loss: 1442442.7500 - val_loss: 1392356.1250
Epoch 15/200
140/140 [==============================] - 0s 3ms/step - loss: 1436001.7500 - val_loss: 1401064.1250
Epoch 16/200
140/140 [==============================] - 0s 3ms/step - loss: 1430856.5000 - val_loss: 1392359.2500
Epoch 17/200
140/140 [==============================] - 0s 3ms/step - loss: 1436722.8750 - val_loss: 1402500.8750
Epoch 18/200
140/140 [==============================] - 0s 3ms/step - loss: 1434826.1250 - val_loss: 1401636.8750
Epoch 19/200
140/140 [==============================] - 0s 3ms/step - loss: 1440673.6250 - val_loss: 1419727.8750
Epoch 20/200
140/140 [==============================] - 0s 3ms/step - loss: 1431067.5000 - val_loss: 1390740.0000
Epoch 21/200
140/140 [==============================] - 0s 3ms/step - loss: 1430442.2500 - val_loss: 1394357.1250
Epoch 22/200
140/140 [==============================] - 0s 3ms/step - loss: 1426664.3750 - val_loss: 1394058.6250
Epoch 23/200
140/140 [==============================] - 0s 3ms/step - loss: 1434010.8750 - val_loss: 1395973.6250
Epoch 24/200
140/140 [==============================] - 0s 3ms/step - loss: 1428559.3750 - val_loss: 1399149.5000
Epoch 25/200
140/140 [==============================] - 0s 3ms/step - loss: 1433121.0000 - val_loss: 1405166.6250
Epoch 26/200
140/140 [==============================] - 0s 3ms/step - loss: 1427397.1250 - val_loss: 1401448.8750
Epoch 27/200
140/140 [==============================] - 0s 3ms/step - loss: 1428840.6250 - val_loss: 1391197.2500
Epoch 28/200
140/140 [==============================] - 0s 3ms/step - loss: 1427012.6250 - val_loss: 1400141.3750
Epoch 29/200
140/140 [==============================] - 0s 3ms/step - loss: 1421986.6250 - val_loss: 1386063.0000
Epoch 30/200
140/140 [==============================] - 0s 3ms/step - loss: 1428953.8750 - val_loss: 1389191.7500
Epoch 31/200
140/140 [==============================] - 0s 3ms/step - loss: 1428568.1250 - val_loss: 1396363.2500
Epoch 32/200
140/140 [==============================] - 0s 3ms/step - loss: 1420260.7500 - val_loss: 1380258.7500
Epoch 33/200
140/140 [==============================] - 0s 3ms/step - loss: 1425278.6250 - val_loss: 1391876.8750
Epoch 34/200
140/140 [==============================] - 0s 3ms/step - loss: 1424156.6250 - val_loss: 1388287.5000
Epoch 35/200
140/140 [==============================] - 0s 3ms/step - loss: 1420729.3750 - val_loss: 1391245.5000
Epoch 36/200
140/140 [==============================] - 0s 3ms/step - loss: 1424842.7500 - val_loss: 1407553.2500
Epoch 37/200
140/140 [==============================] - 0s 3ms/step - loss: 1428529.7500 - val_loss: 1393987.1250
Epoch 38/200
140/140 [==============================] - 0s 3ms/step - loss: 1423170.8750 - val_loss: 1392156.6250
Epoch 39/200
140/140 [==============================] - 0s 3ms/step - loss: 1421934.8750 - val_loss: 1391865.5000
Epoch 40/200
140/140 [==============================] - 0s 3ms/step - loss: 1423879.1250 - val_loss: 1393015.1250
Epoch 41/200
140/140 [==============================] - 0s 3ms/step - loss: 1424652.0000 - val_loss: 1383128.3750
Epoch 42/200
140/140 [==============================] - 0s 3ms/step - loss: 1419136.8750 - val_loss: 1398693.0000
Epoch 43/200
140/140 [==============================] - 0s 3ms/step - loss: 1421266.0000 - val_loss: 1391458.5000
Epoch 44/200
140/140 [==============================] - 0s 3ms/step - loss: 1419633.1250 - val_loss: 1384295.2500
Epoch 45/200
140/140 [==============================] - 0s 3ms/step - loss: 1418902.8750 - val_loss: 1390107.2500
Epoch 46/200
140/140 [==============================] - 0s 3ms/step - loss: 1420556.5000 - val_loss: 1383498.6250
Epoch 47/200
140/140 [==============================] - 0s 3ms/step - loss: 1421123.3750 - val_loss: 1393581.2500
Epoch 48/200
140/140 [==============================] - 0s 3ms/step - loss: 1418570.0000 - val_loss: 1390919.1250
Epoch 49/200
140/140 [==============================] - 0s 3ms/step - loss: 1422632.3750 - val_loss: 1396823.2500
Epoch 50/200
140/140 [==============================] - 0s 3ms/step - loss: 1417133.0000 - val_loss: 1396227.0000
Epoch 51/200
140/140 [==============================] - 0s 3ms/step - loss: 1414529.1250 - val_loss: 1386883.1250
Epoch 52/200
140/140 [==============================] - 0s 3ms/step - loss: 1422229.6250 - val_loss: 1395037.5000
Epoch 53/200
140/140 [==============================] - 0s 3ms/step - loss: 1418594.7500 - val_loss: 1386680.8750
Epoch 54/200
140/140 [==============================] - 0s 3ms/step - loss: 1421613.5000 - val_loss: 1402909.6250
Epoch 55/200
140/140 [==============================] - 0s 3ms/step - loss: 1420715.5000 - val_loss: 1390603.1250
Epoch 56/200
140/140 [==============================] - 0s 3ms/step - loss: 1409495.8750 - val_loss: 1396955.7500
Epoch 57/200
140/140 [==============================] - 0s 3ms/step - loss: 1415390.0000 - val_loss: 1390761.0000
Epoch 58/200
140/140 [==============================] - 0s 3ms/step - loss: 1423115.2500 - val_loss: 1389312.0000
Epoch 59/200
140/140 [==============================] - 0s 3ms/step - loss: 1416375.8750 - val_loss: 1388101.8750
Epoch 60/200
140/140 [==============================] - 0s 3ms/step - loss: 1411909.1250 - val_loss: 1392200.2500
Epoch 61/200
140/140 [==============================] - 0s 3ms/step - loss: 1419872.3750 - val_loss: 1389758.2500
Epoch 62/200
140/140 [==============================] - 0s 3ms/step - loss: 1418682.6250 - val_loss: 1385804.7500
Epoch 63/200
140/140 [==============================] - 0s 3ms/step - loss: 1415639.5000 - val_loss: 1384303.8750
Epoch 64/200
140/140 [==============================] - 0s 3ms/step - loss: 1410429.2500 - val_loss: 1392434.8750
Epoch 65/200
140/140 [==============================] - 0s 3ms/step - loss: 1420559.0000 - val_loss: 1388921.3750
Epoch 66/200
140/140 [==============================] - 0s 3ms/step - loss: 1419026.2500 - val_loss: 1391052.6250
Epoch 67/200
140/140 [==============================] - 0s 3ms/step - loss: 1418475.2500 - val_loss: 1385793.3750
Epoch 68/200
140/140 [==============================] - 0s 3ms/step - loss: 1413581.2500 - val_loss: 1388164.2500
Epoch 69/200
140/140 [==============================] - 0s 3ms/step - loss: 1417832.0000 - val_loss: 1379785.6250
Epoch 70/200
140/140 [==============================] - 1s 4ms/step - loss: 1414860.3750 - val_loss: 1383002.0000
Epoch 71/200
140/140 [==============================] - 1s 4ms/step - loss: 1421853.2500 - val_loss: 1386810.2500
Epoch 72/200
140/140 [==============================] - 0s 3ms/step - loss: 1413392.3750 - val_loss: 1393864.0000
Epoch 73/200
140/140 [==============================] - 0s 3ms/step - loss: 1410971.2500 - val_loss: 1379773.6250
Epoch 74/200
140/140 [==============================] - 0s 3ms/step - loss: 1421388.0000 - val_loss: 1378187.1250
Epoch 75/200
140/140 [==============================] - 0s 3ms/step - loss: 1411644.6250 - val_loss: 1379233.0000
Epoch 76/200
140/140 [==============================] - 0s 3ms/step - loss: 1416345.5000 - val_loss: 1384326.6250
Epoch 77/200
140/140 [==============================] - 0s 3ms/step - loss: 1413238.7500 - val_loss: 1377591.3750
Epoch 78/200
140/140 [==============================] - 0s 3ms/step - loss: 1415508.8750 - val_loss: 1379196.1250
Epoch 79/200
140/140 [==============================] - 0s 3ms/step - loss: 1416322.5000 - val_loss: 1383915.7500
Epoch 80/200
140/140 [==============================] - 0s 3ms/step - loss: 1415723.3750 - val_loss: 1376945.3750
Epoch 81/200
140/140 [==============================] - 0s 3ms/step - loss: 1412045.3750 - val_loss: 1380170.5000
Epoch 82/200
140/140 [==============================] - 0s 3ms/step - loss: 1411410.3750 - val_loss: 1385965.7500
Epoch 83/200
140/140 [==============================] - 0s 3ms/step - loss: 1411381.5000 - val_loss: 1384564.0000
Epoch 84/200
140/140 [==============================] - 0s 3ms/step - loss: 1416947.5000 - val_loss: 1393376.2500
Epoch 85/200
140/140 [==============================] - 0s 3ms/step - loss: 1418695.5000 - val_loss: 1387181.8750
Epoch 86/200
140/140 [==============================] - 0s 3ms/step - loss: 1414825.7500 - val_loss: 1378964.1250
Epoch 87/200
140/140 [==============================] - 0s 3ms/step - loss: 1411720.6250 - val_loss: 1385548.0000
Epoch 88/200
140/140 [==============================] - 0s 3ms/step - loss: 1413335.5000 - val_loss: 1373881.7500
Epoch 89/200
140/140 [==============================] - 0s 3ms/step - loss: 1411872.2500 - val_loss: 1381388.5000
Epoch 90/200
140/140 [==============================] - 0s 3ms/step - loss: 1412426.8750 - val_loss: 1385041.0000
Epoch 91/200
140/140 [==============================] - 0s 3ms/step - loss: 1416321.6250 - val_loss: 1388642.5000
Epoch 92/200
140/140 [==============================] - 0s 3ms/step - loss: 1416138.6250 - val_loss: 1378641.3750
Epoch 93/200
140/140 [==============================] - 0s 3ms/step - loss: 1414557.2500 - val_loss: 1380414.1250
Epoch 94/200
140/140 [==============================] - 0s 3ms/step - loss: 1410993.2500 - val_loss: 1378496.1250
Epoch 95/200
140/140 [==============================] - 0s 3ms/step - loss: 1407131.5000 - val_loss: 1387452.3750
Epoch 96/200
140/140 [==============================] - 0s 3ms/step - loss: 1420842.5000 - val_loss: 1374634.0000
Epoch 97/200
140/140 [==============================] - 0s 3ms/step - loss: 1410150.7500 - val_loss: 1392747.5000
Epoch 98/200
140/140 [==============================] - 0s 3ms/step - loss: 1415643.8750 - val_loss: 1393514.1250
Epoch 99/200
140/140 [==============================] - 0s 3ms/step - loss: 1407695.1250 - val_loss: 1384049.3750
Epoch 100/200
140/140 [==============================] - 0s 3ms/step - loss: 1413704.6250 - val_loss: 1378549.3750
Epoch 101/200
140/140 [==============================] - 0s 3ms/step - loss: 1412818.1250 - val_loss: 1376930.3750
Epoch 102/200
140/140 [==============================] - 0s 3ms/step - loss: 1410818.0000 - val_loss: 1391440.8750
Epoch 103/200
140/140 [==============================] - 0s 3ms/step - loss: 1413578.1250 - val_loss: 1376327.2500
Epoch 104/200
140/140 [==============================] - 0s 3ms/step - loss: 1416704.7500 - val_loss: 1379314.2500
Epoch 105/200
140/140 [==============================] - 0s 3ms/step - loss: 1411270.7500 - val_loss: 1383539.8750
Epoch 106/200
140/140 [==============================] - 0s 3ms/step - loss: 1410727.5000 - val_loss: 1388346.6250
Epoch 107/200
140/140 [==============================] - 0s 3ms/step - loss: 1411402.3750 - val_loss: 1394155.0000
Epoch 108/200
140/140 [==============================] - 0s 3ms/step - loss: 1414585.2500 - val_loss: 1386153.5000
Epoch 109/200
140/140 [==============================] - 0s 3ms/step - loss: 1417964.2500 - val_loss: 1375447.0000
Epoch 110/200
140/140 [==============================] - 0s 3ms/step - loss: 1415428.0000 - val_loss: 1378842.7500
Epoch 111/200
140/140 [==============================] - 0s 3ms/step - loss: 1408504.0000 - val_loss: 1377256.5000
Epoch 112/200
140/140 [==============================] - 0s 3ms/step - loss: 1414037.7500 - val_loss: 1392968.6250
Epoch 113/200
140/140 [==============================] - 0s 3ms/step - loss: 1412768.7500 - val_loss: 1380055.1250
Epoch 114/200
140/140 [==============================] - 0s 3ms/step - loss: 1414147.5000 - val_loss: 1381223.2500
Epoch 115/200
140/140 [==============================] - 0s 3ms/step - loss: 1409623.8750 - val_loss: 1389494.6250
Epoch 116/200
140/140 [==============================] - 0s 3ms/step - loss: 1417487.0000 - val_loss: 1378832.8750
Epoch 117/200
140/140 [==============================] - 0s 3ms/step - loss: 1418738.1250 - val_loss: 1379044.1250
Epoch 118/200
140/140 [==============================] - 0s 3ms/step - loss: 1416121.8750 - val_loss: 1383180.5000
Epoch 119/200
140/140 [==============================] - 0s 3ms/step - loss: 1407828.8750 - val_loss: 1387318.0000
Epoch 120/200
140/140 [==============================] - 1s 4ms/step - loss: 1408600.2500 - val_loss: 1381838.3750
Epoch 121/200
140/140 [==============================] - 0s 3ms/step - loss: 1407783.1250 - val_loss: 1385853.1250
Epoch 122/200
140/140 [==============================] - 0s 3ms/step - loss: 1407666.2500 - val_loss: 1384271.5000
Epoch 123/200
140/140 [==============================] - 0s 3ms/step - loss: 1415026.8750 - val_loss: 1381171.6250
Epoch 124/200
140/140 [==============================] - 0s 3ms/step - loss: 1413857.2500 - val_loss: 1379906.7500
Epoch 125/200
140/140 [==============================] - 0s 3ms/step - loss: 1409571.8750 - val_loss: 1374709.1250
Epoch 126/200
140/140 [==============================] - 0s 3ms/step - loss: 1413445.5000 - val_loss: 1391138.1250
Epoch 127/200
140/140 [==============================] - 0s 3ms/step - loss: 1414078.3750 - val_loss: 1378408.1250
Epoch 128/200
140/140 [==============================] - 0s 3ms/step - loss: 1411784.7500 - val_loss: 1381051.8750
Epoch 129/200
140/140 [==============================] - 0s 4ms/step - loss: 1408556.6250 - val_loss: 1391740.6250
Epoch 130/200
140/140 [==============================] - 0s 3ms/step - loss: 1410047.0000 - val_loss: 1383879.3750
Epoch 131/200
140/140 [==============================] - 1s 4ms/step - loss: 1409422.6250 - val_loss: 1375020.5000
Epoch 132/200
140/140 [==============================] - 0s 3ms/step - loss: 1411453.1250 - val_loss: 1374547.0000
Epoch 133/200
140/140 [==============================] - 0s 3ms/step - loss: 1410211.2500 - val_loss: 1384825.2500
Epoch 134/200
140/140 [==============================] - 0s 3ms/step - loss: 1412117.5000 - val_loss: 1381737.1250
Epoch 135/200
140/140 [==============================] - 0s 3ms/step - loss: 1407782.5000 - val_loss: 1381594.3750
Epoch 136/200
140/140 [==============================] - 0s 3ms/step - loss: 1408939.5000 - val_loss: 1377287.6250
Epoch 137/200
140/140 [==============================] - 0s 3ms/step - loss: 1408129.3750 - val_loss: 1386878.3750
Epoch 138/200
140/140 [==============================] - 0s 3ms/step - loss: 1412466.7500 - val_loss: 1383745.0000
Epoch 139/200
140/140 [==============================] - 0s 3ms/step - loss: 1414378.5000 - val_loss: 1382952.7500
Epoch 140/200
140/140 [==============================] - 0s 3ms/step - loss: 1410192.3750 - val_loss: 1386490.3750
Epoch 141/200
140/140 [==============================] - 0s 3ms/step - loss: 1413894.6250 - val_loss: 1385054.1250
Epoch 142/200
140/140 [==============================] - 0s 3ms/step - loss: 1413429.3750 - val_loss: 1384052.0000
Epoch 143/200
140/140 [==============================] - 0s 3ms/step - loss: 1410066.5000 - val_loss: 1382996.3750
Epoch 144/200
140/140 [==============================] - 0s 3ms/step - loss: 1413406.7500 - val_loss: 1382818.1250
Epoch 145/200
140/140 [==============================] - 0s 3ms/step - loss: 1408572.1250 - val_loss: 1381852.0000
Epoch 146/200
140/140 [==============================] - 1s 4ms/step - loss: 1412500.7500 - val_loss: 1378544.8750
Epoch 147/200
140/140 [==============================] - 0s 4ms/step - loss: 1409030.7500 - val_loss: 1378565.6250
Epoch 148/200
140/140 [==============================] - 0s 3ms/step - loss: 1410710.2500 - val_loss: 1389443.5000
Epoch 149/200
140/140 [==============================] - 1s 4ms/step - loss: 1412880.6250 - val_loss: 1380630.3750
Epoch 150/200
140/140 [==============================] - 0s 3ms/step - loss: 1416062.6250 - val_loss: 1378146.8750
Epoch 151/200
140/140 [==============================] - 1s 5ms/step - loss: 1407782.5000 - val_loss: 1375746.7500
Epoch 152/200
140/140 [==============================] - 0s 3ms/step - loss: 1409495.7500 - val_loss: 1387513.6250
Epoch 153/200
140/140 [==============================] - 0s 3ms/step - loss: 1410602.2500 - val_loss: 1376398.0000
Epoch 154/200
140/140 [==============================] - 0s 3ms/step - loss: 1411832.7500 - val_loss: 1395199.2500
Epoch 155/200
140/140 [==============================] - 0s 3ms/step - loss: 1410691.5000 - val_loss: 1385729.5000
Epoch 156/200
140/140 [==============================] - 1s 4ms/step - loss: 1416502.3750 - val_loss: 1382986.7500
Epoch 157/200
140/140 [==============================] - 0s 3ms/step - loss: 1410088.1250 - val_loss: 1384353.1250
Epoch 158/200
140/140 [==============================] - 0s 3ms/step - loss: 1412727.5000 - val_loss: 1378142.3750
Epoch 159/200
140/140 [==============================] - 1s 5ms/step - loss: 1406201.0000 - val_loss: 1383531.3750
Epoch 160/200
140/140 [==============================] - 0s 3ms/step - loss: 1409225.6250 - val_loss: 1382607.5000
Epoch 161/200
140/140 [==============================] - 0s 3ms/step - loss: 1409858.3750 - val_loss: 1393526.1250
Epoch 162/200
140/140 [==============================] - 0s 3ms/step - loss: 1412272.5000 - val_loss: 1378026.6250
Epoch 163/200
140/140 [==============================] - 0s 3ms/step - loss: 1410637.3750 - val_loss: 1385345.2500
Epoch 164/200
140/140 [==============================] - 0s 3ms/step - loss: 1408113.0000 - val_loss: 1374428.6250
Epoch 165/200
140/140 [==============================] - 0s 3ms/step - loss: 1408842.6250 - val_loss: 1380631.7500
Epoch 166/200
140/140 [==============================] - 1s 4ms/step - loss: 1410546.3750 - val_loss: 1386066.0000
Epoch 167/200
140/140 [==============================] - 0s 3ms/step - loss: 1416270.8750 - val_loss: 1382978.5000
Epoch 168/200
140/140 [==============================] - 0s 3ms/step - loss: 1415044.1250 - val_loss: 1375656.6250
Epoch 169/200
140/140 [==============================] - 0s 3ms/step - loss: 1413258.2500 - val_loss: 1379448.5000
Epoch 170/200
140/140 [==============================] - 0s 3ms/step - loss: 1405510.6250 - val_loss: 1378933.5000
Epoch 171/200
140/140 [==============================] - 1s 4ms/step - loss: 1409555.3750 - val_loss: 1389236.7500
Epoch 172/200
140/140 [==============================] - 0s 3ms/step - loss: 1406037.2500 - val_loss: 1380694.0000
Epoch 173/200
140/140 [==============================] - 0s 3ms/step - loss: 1406770.3750 - val_loss: 1384688.2500
Epoch 174/200
140/140 [==============================] - 0s 3ms/step - loss: 1409776.3750 - val_loss: 1384004.1250
Epoch 175/200
140/140 [==============================] - 1s 4ms/step - loss: 1415245.2500 - val_loss: 1399222.5000
Epoch 176/200
140/140 [==============================] - 0s 3ms/step - loss: 1415514.2500 - val_loss: 1377263.2500
Epoch 177/200
140/140 [==============================] - 0s 3ms/step - loss: 1414623.2500 - val_loss: 1379547.3750
Epoch 178/200
140/140 [==============================] - 0s 3ms/step - loss: 1404499.0000 - val_loss: 1380912.8750
Epoch 179/200
140/140 [==============================] - 0s 4ms/step - loss: 1409632.7500 - val_loss: 1376680.6250
Epoch 180/200
140/140 [==============================] - 0s 3ms/step - loss: 1411043.5000 - val_loss: 1377313.7500
Epoch 181/200
140/140 [==============================] - 0s 3ms/step - loss: 1408492.5000 - val_loss: 1388759.8750
Epoch 182/200
140/140 [==============================] - 0s 3ms/step - loss: 1403291.8750 - val_loss: 1390328.7500
Epoch 183/200
140/140 [==============================] - 0s 3ms/step - loss: 1409988.8750 - val_loss: 1381556.0000
Epoch 184/200
140/140 [==============================] - 0s 3ms/step - loss: 1414961.2500 - val_loss: 1379068.7500
Epoch 185/200
140/140 [==============================] - 1s 4ms/step - loss: 1410968.0000 - val_loss: 1378419.1250
Epoch 186/200
140/140 [==============================] - 0s 3ms/step - loss: 1406519.7500 - val_loss: 1384269.7500
Epoch 187/200
140/140 [==============================] - 0s 3ms/step - loss: 1405578.7500 - val_loss: 1379195.1250
Epoch 188/200
140/140 [==============================] - 0s 3ms/step - loss: 1410766.1250 - val_loss: 1383883.0000
Epoch 189/200
140/140 [==============================] - 0s 3ms/step - loss: 1408213.1250 - val_loss: 1376011.1250
Epoch 190/200
140/140 [==============================] - 0s 3ms/step - loss: 1411512.1250 - val_loss: 1381462.3750
Epoch 191/200
140/140 [==============================] - 1s 4ms/step - loss: 1412263.7500 - val_loss: 1383373.1250
Epoch 192/200
140/140 [==============================] - 1s 4ms/step - loss: 1409278.3750 - val_loss: 1377394.8750
Epoch 193/200
140/140 [==============================] - 0s 4ms/step - loss: 1406527.3750 - val_loss: 1376350.8750
Epoch 194/200
140/140 [==============================] - 0s 3ms/step - loss: 1404841.1250 - val_loss: 1382095.1250
Epoch 195/200
140/140 [==============================] - 0s 4ms/step - loss: 1408023.6250 - val_loss: 1385241.1250
Epoch 196/200
140/140 [==============================] - 0s 3ms/step - loss: 1412890.0000 - val_loss: 1378528.6250
Epoch 197/200
140/140 [==============================] - 1s 4ms/step - loss: 1410367.5000 - val_loss: 1381460.5000
Epoch 198/200
140/140 [==============================] - 0s 3ms/step - loss: 1411713.1250 - val_loss: 1380817.0000
Epoch 199/200
140/140 [==============================] - 0s 3ms/step - loss: 1411654.6250 - val_loss: 1374923.2500
Epoch 200/200
140/140 [==============================] - 0s 3ms/step - loss: 1411264.5000 - val_loss: 1385575.8750
In [110]:
encoding_strategy = 'gloveSynopsis' + encoding_strategy
In [111]:
gv = fe.GloveAutoEncoder()
X_train_synopsis = gv.create_glove_encoding(docs=X_train['Synopsis'],\
                      col_name='Synopis',\
                      max_length=100,\
                      epochs=200,
                      learning_rate=.5)
There are 41290 words in data
Indexing word vector
Indexed 400000 words
Here is shape of embedding matrix (41291, 100)
Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          4129100   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 2)            602       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 33, 2)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 66)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 66)                264       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6700      
=================================================================
Total params: 4,136,666
Trainable params: 7,434
Non-trainable params: 4,129,232
_________________________________________________________________
None
Epoch 1/200
140/140 [==============================] - 2s 11ms/step - loss: 33589080.0000 - val_loss: 29825470.0000
Epoch 2/200
140/140 [==============================] - 1s 10ms/step - loss: 29547620.0000 - val_loss: 28771122.0000
Epoch 3/200
140/140 [==============================] - 1s 10ms/step - loss: 29196498.0000 - val_loss: 28265994.0000
Epoch 4/200
140/140 [==============================] - 2s 12ms/step - loss: 29044192.0000 - val_loss: 28360956.0000
Epoch 5/200
140/140 [==============================] - 1s 10ms/step - loss: 28894320.0000 - val_loss: 28229644.0000
Epoch 6/200
140/140 [==============================] - 2s 12ms/step - loss: 28872652.0000 - val_loss: 27980306.0000
Epoch 7/200
140/140 [==============================] - 1s 10ms/step - loss: 28832644.0000 - val_loss: 28039410.0000
Epoch 8/200
140/140 [==============================] - 1s 10ms/step - loss: 28737330.0000 - val_loss: 27858540.0000
Epoch 9/200
140/140 [==============================] - 1s 11ms/step - loss: 28671658.0000 - val_loss: 27761806.0000
Epoch 10/200
140/140 [==============================] - 1s 10ms/step - loss: 28588428.0000 - val_loss: 27753770.0000
Epoch 11/200
140/140 [==============================] - 2s 12ms/step - loss: 28532792.0000 - val_loss: 27774580.0000
Epoch 12/200
140/140 [==============================] - 2s 11ms/step - loss: 28497664.0000 - val_loss: 27920842.0000
Epoch 13/200
140/140 [==============================] - 1s 10ms/step - loss: 28473978.0000 - val_loss: 27868090.0000
Epoch 14/200
140/140 [==============================] - 2s 12ms/step - loss: 28384658.0000 - val_loss: 27538928.0000
Epoch 15/200
140/140 [==============================] - 2s 12ms/step - loss: 27773314.0000 - val_loss: 26636848.0000
Epoch 16/200
140/140 [==============================] - 1s 10ms/step - loss: 26898932.0000 - val_loss: 25507738.0000
Epoch 17/200
140/140 [==============================] - 2s 12ms/step - loss: 26293636.0000 - val_loss: 25185436.0000
Epoch 18/200
140/140 [==============================] - 2s 12ms/step - loss: 25913966.0000 - val_loss: 24764764.0000
Epoch 19/200
140/140 [==============================] - 1s 10ms/step - loss: 25702590.0000 - val_loss: 24631594.0000
Epoch 20/200
140/140 [==============================] - 2s 12ms/step - loss: 25592992.0000 - val_loss: 24788794.0000
Epoch 21/200
140/140 [==============================] - 1s 10ms/step - loss: 25503568.0000 - val_loss: 24497528.0000
Epoch 22/200
140/140 [==============================] - 1s 10ms/step - loss: 25434928.0000 - val_loss: 24357152.0000
Epoch 23/200
140/140 [==============================] - 1s 10ms/step - loss: 25390006.0000 - val_loss: 24296654.0000
Epoch 24/200
140/140 [==============================] - 2s 12ms/step - loss: 25351350.0000 - val_loss: 24266258.0000
Epoch 25/200
140/140 [==============================] - 1s 10ms/step - loss: 25298566.0000 - val_loss: 24231506.0000
Epoch 26/200
140/140 [==============================] - 2s 12ms/step - loss: 25289558.0000 - val_loss: 24181332.0000
Epoch 27/200
140/140 [==============================] - 1s 10ms/step - loss: 25236118.0000 - val_loss: 24339334.0000
Epoch 28/200
140/140 [==============================] - 1s 10ms/step - loss: 25245782.0000 - val_loss: 24338056.0000
Epoch 29/200
140/140 [==============================] - 2s 12ms/step - loss: 25269464.0000 - val_loss: 24244768.0000
Epoch 30/200
140/140 [==============================] - 2s 12ms/step - loss: 25246296.0000 - val_loss: 24265098.0000
Epoch 31/200
140/140 [==============================] - 1s 10ms/step - loss: 25257230.0000 - val_loss: 24216128.0000
Epoch 32/200
140/140 [==============================] - 1s 10ms/step - loss: 25213176.0000 - val_loss: 24163282.0000
Epoch 33/200
140/140 [==============================] - 1s 10ms/step - loss: 25198948.0000 - val_loss: 24249424.0000
Epoch 34/200
140/140 [==============================] - 2s 12ms/step - loss: 25243846.0000 - val_loss: 24191140.0000
Epoch 35/200
140/140 [==============================] - 1s 10ms/step - loss: 25174914.0000 - val_loss: 24409926.0000
Epoch 36/200
140/140 [==============================] - 2s 12ms/step - loss: 25230340.0000 - val_loss: 24206158.0000
Epoch 37/200
140/140 [==============================] - 1s 10ms/step - loss: 25171528.0000 - val_loss: 24164386.0000
Epoch 38/200
140/140 [==============================] - 1s 10ms/step - loss: 25158380.0000 - val_loss: 24131876.0000
Epoch 39/200
140/140 [==============================] - 1s 10ms/step - loss: 25195262.0000 - val_loss: 24110670.0000
Epoch 40/200
140/140 [==============================] - 2s 12ms/step - loss: 25164312.0000 - val_loss: 24121994.0000
Epoch 41/200
140/140 [==============================] - 1s 10ms/step - loss: 25143986.0000 - val_loss: 24129910.0000
Epoch 42/200
140/140 [==============================] - 1s 10ms/step - loss: 25140514.0000 - val_loss: 24183490.0000
Epoch 43/200
140/140 [==============================] - 2s 12ms/step - loss: 25120500.0000 - val_loss: 24155144.0000
Epoch 44/200
140/140 [==============================] - 2s 12ms/step - loss: 25159058.0000 - val_loss: 24089804.0000
Epoch 45/200
140/140 [==============================] - 1s 10ms/step - loss: 25111332.0000 - val_loss: 24072804.0000
Epoch 46/200
140/140 [==============================] - 1s 10ms/step - loss: 25098182.0000 - val_loss: 24101980.0000
Epoch 47/200
140/140 [==============================] - 1s 10ms/step - loss: 25132360.0000 - val_loss: 24063314.0000
Epoch 48/200
140/140 [==============================] - 1s 10ms/step - loss: 25090302.0000 - val_loss: 24084076.0000
Epoch 49/200
140/140 [==============================] - 2s 11ms/step - loss: 25083562.0000 - val_loss: 24154540.0000
Epoch 50/200
140/140 [==============================] - 2s 12ms/step - loss: 25090064.0000 - val_loss: 24062230.0000
Epoch 51/200
140/140 [==============================] - 1s 11ms/step - loss: 25088702.0000 - val_loss: 24128366.0000
Epoch 52/200
140/140 [==============================] - 1s 10ms/step - loss: 25103670.0000 - val_loss: 24072392.0000
Epoch 53/200
140/140 [==============================] - 1s 10ms/step - loss: 25081176.0000 - val_loss: 24137738.0000
Epoch 54/200
140/140 [==============================] - 1s 10ms/step - loss: 25064358.0000 - val_loss: 24077438.0000
Epoch 55/200
140/140 [==============================] - 1s 10ms/step - loss: 25056754.0000 - val_loss: 24236936.0000
Epoch 56/200
140/140 [==============================] - 1s 10ms/step - loss: 25053630.0000 - val_loss: 24051594.0000
Epoch 57/200
140/140 [==============================] - 1s 10ms/step - loss: 25066066.0000 - val_loss: 24039072.0000
Epoch 58/200
140/140 [==============================] - 1s 10ms/step - loss: 25026418.0000 - val_loss: 24032152.0000
Epoch 59/200
140/140 [==============================] - 2s 12ms/step - loss: 25028644.0000 - val_loss: 24239674.0000
Epoch 60/200
140/140 [==============================] - 1s 10ms/step - loss: 25036854.0000 - val_loss: 24098346.0000
Epoch 61/200
140/140 [==============================] - 2s 12ms/step - loss: 25017438.0000 - val_loss: 24068656.0000
Epoch 62/200
140/140 [==============================] - 2s 12ms/step - loss: 25021338.0000 - val_loss: 24044558.0000
Epoch 63/200
140/140 [==============================] - 2s 12ms/step - loss: 25009218.0000 - val_loss: 24063560.0000
Epoch 64/200
140/140 [==============================] - 1s 10ms/step - loss: 25027928.0000 - val_loss: 24031340.0000
Epoch 65/200
140/140 [==============================] - 1s 10ms/step - loss: 25028322.0000 - val_loss: 24063474.0000
Epoch 66/200
140/140 [==============================] - 1s 10ms/step - loss: 25009096.0000 - val_loss: 24043090.0000
Epoch 67/200
140/140 [==============================] - 2s 12ms/step - loss: 25027742.0000 - val_loss: 23987802.0000
Epoch 68/200
140/140 [==============================] - 2s 11ms/step - loss: 24991822.0000 - val_loss: 24008476.0000
Epoch 69/200
140/140 [==============================] - 1s 10ms/step - loss: 25034892.0000 - val_loss: 23975484.0000
Epoch 70/200
140/140 [==============================] - 1s 10ms/step - loss: 25009688.0000 - val_loss: 24109086.0000
Epoch 71/200
140/140 [==============================] - 2s 15ms/step - loss: 25017112.0000 - val_loss: 24019530.0000
Epoch 72/200
140/140 [==============================] - 2s 12ms/step - loss: 24999584.0000 - val_loss: 24083238.0000
Epoch 73/200
140/140 [==============================] - 1s 11ms/step - loss: 24998714.0000 - val_loss: 24030368.0000
Epoch 74/200
140/140 [==============================] - 2s 12ms/step - loss: 25001960.0000 - val_loss: 24013250.0000
Epoch 75/200
140/140 [==============================] - 1s 10ms/step - loss: 25006914.0000 - val_loss: 24026774.0000
Epoch 76/200
140/140 [==============================] - 2s 12ms/step - loss: 24995600.0000 - val_loss: 24037678.0000
Epoch 77/200
140/140 [==============================] - 2s 12ms/step - loss: 24968908.0000 - val_loss: 24037932.0000
Epoch 78/200
140/140 [==============================] - 2s 12ms/step - loss: 25012318.0000 - val_loss: 24068870.0000
Epoch 79/200
140/140 [==============================] - 2s 12ms/step - loss: 25020984.0000 - val_loss: 24024044.0000
Epoch 80/200
140/140 [==============================] - 1s 10ms/step - loss: 24968216.0000 - val_loss: 24012994.0000
Epoch 81/200
140/140 [==============================] - 2s 12ms/step - loss: 24989004.0000 - val_loss: 24056818.0000
Epoch 82/200
140/140 [==============================] - 2s 12ms/step - loss: 24988892.0000 - val_loss: 24036110.0000
Epoch 83/200
140/140 [==============================] - 2s 12ms/step - loss: 24982414.0000 - val_loss: 24018292.0000
Epoch 84/200
140/140 [==============================] - 1s 10ms/step - loss: 24989106.0000 - val_loss: 24028604.0000
Epoch 85/200
140/140 [==============================] - 1s 10ms/step - loss: 24989838.0000 - val_loss: 24011662.0000
Epoch 86/200
140/140 [==============================] - 2s 12ms/step - loss: 25015292.0000 - val_loss: 24021360.0000
Epoch 87/200
140/140 [==============================] - 1s 10ms/step - loss: 25003244.0000 - val_loss: 24034000.0000
Epoch 88/200
140/140 [==============================] - 1s 10ms/step - loss: 24994358.0000 - val_loss: 23976306.0000
Epoch 89/200
140/140 [==============================] - 2s 12ms/step - loss: 24996060.0000 - val_loss: 23980430.0000
Epoch 90/200
140/140 [==============================] - 2s 12ms/step - loss: 25010276.0000 - val_loss: 24012378.0000
Epoch 91/200
140/140 [==============================] - 1s 10ms/step - loss: 25005128.0000 - val_loss: 24080716.0000
Epoch 92/200
140/140 [==============================] - 2s 12ms/step - loss: 24970798.0000 - val_loss: 24020532.0000
Epoch 93/200
140/140 [==============================] - 2s 11ms/step - loss: 24969358.0000 - val_loss: 23966258.0000
Epoch 94/200
140/140 [==============================] - 2s 11ms/step - loss: 24994840.0000 - val_loss: 24019220.0000
Epoch 95/200
140/140 [==============================] - 1s 11ms/step - loss: 24992598.0000 - val_loss: 23979300.0000
Epoch 96/200
140/140 [==============================] - 2s 11ms/step - loss: 25008624.0000 - val_loss: 24040256.0000
Epoch 97/200
140/140 [==============================] - 2s 13ms/step - loss: 24961428.0000 - val_loss: 23999530.0000
Epoch 98/200
140/140 [==============================] - 2s 11ms/step - loss: 24932292.0000 - val_loss: 24025048.0000
Epoch 99/200
140/140 [==============================] - 2s 11ms/step - loss: 24974078.0000 - val_loss: 24026212.0000
Epoch 100/200
140/140 [==============================] - 1s 11ms/step - loss: 25013116.0000 - val_loss: 24188504.0000
Epoch 101/200
140/140 [==============================] - 1s 10ms/step - loss: 24968462.0000 - val_loss: 24133892.0000
Epoch 102/200
140/140 [==============================] - 1s 10ms/step - loss: 24946838.0000 - val_loss: 24082654.0000
Epoch 103/200
140/140 [==============================] - 2s 12ms/step - loss: 24998968.0000 - val_loss: 24088736.0000
Epoch 104/200
140/140 [==============================] - 2s 12ms/step - loss: 24985946.0000 - val_loss: 24012800.0000
Epoch 105/200
140/140 [==============================] - 2s 12ms/step - loss: 24944584.0000 - val_loss: 24275596.0000
Epoch 106/200
140/140 [==============================] - 2s 12ms/step - loss: 24942934.0000 - val_loss: 24010034.0000
Epoch 107/200
140/140 [==============================] - 1s 10ms/step - loss: 24958446.0000 - val_loss: 24031030.0000
Epoch 108/200
140/140 [==============================] - 1s 10ms/step - loss: 24984208.0000 - val_loss: 23946900.0000
Epoch 109/200
140/140 [==============================] - 1s 10ms/step - loss: 24993144.0000 - val_loss: 24010672.0000
Epoch 110/200
140/140 [==============================] - 2s 12ms/step - loss: 24986458.0000 - val_loss: 23955786.0000
Epoch 111/200
140/140 [==============================] - 2s 12ms/step - loss: 24993208.0000 - val_loss: 23964800.0000
Epoch 112/200
140/140 [==============================] - 1s 10ms/step - loss: 25017850.0000 - val_loss: 24013292.0000
Epoch 113/200
140/140 [==============================] - 2s 12ms/step - loss: 24962906.0000 - val_loss: 23992748.0000
Epoch 114/200
140/140 [==============================] - 1s 10ms/step - loss: 24978736.0000 - val_loss: 24021014.0000
Epoch 115/200
140/140 [==============================] - 2s 12ms/step - loss: 24954040.0000 - val_loss: 24025810.0000
Epoch 116/200
140/140 [==============================] - 2s 12ms/step - loss: 24988566.0000 - val_loss: 24043126.0000
Epoch 117/200
140/140 [==============================] - 2s 12ms/step - loss: 24968672.0000 - val_loss: 24070046.0000
Epoch 118/200
140/140 [==============================] - 1s 10ms/step - loss: 24965060.0000 - val_loss: 23969914.0000
Epoch 119/200
140/140 [==============================] - 2s 11ms/step - loss: 24973644.0000 - val_loss: 24076998.0000
Epoch 120/200
140/140 [==============================] - 1s 10ms/step - loss: 24947930.0000 - val_loss: 24132192.0000
Epoch 121/200
140/140 [==============================] - 1s 10ms/step - loss: 25016212.0000 - val_loss: 24008276.0000
Epoch 122/200
140/140 [==============================] - 2s 12ms/step - loss: 24961292.0000 - val_loss: 23987380.0000
Epoch 123/200
140/140 [==============================] - 2s 12ms/step - loss: 24964628.0000 - val_loss: 24053120.0000
Epoch 124/200
140/140 [==============================] - 1s 10ms/step - loss: 24960114.0000 - val_loss: 24102796.0000
Epoch 125/200
140/140 [==============================] - 2s 12ms/step - loss: 25004652.0000 - val_loss: 24071366.0000
Epoch 126/200
140/140 [==============================] - 2s 12ms/step - loss: 24944484.0000 - val_loss: 24067638.0000
Epoch 127/200
140/140 [==============================] - 2s 12ms/step - loss: 24946926.0000 - val_loss: 24074870.0000
Epoch 128/200
140/140 [==============================] - 2s 12ms/step - loss: 25004026.0000 - val_loss: 24048906.0000
Epoch 129/200
140/140 [==============================] - 2s 13ms/step - loss: 24956888.0000 - val_loss: 24058028.0000
Epoch 130/200
140/140 [==============================] - 2s 12ms/step - loss: 24975248.0000 - val_loss: 24021728.0000
Epoch 131/200
140/140 [==============================] - 1s 11ms/step - loss: 24958764.0000 - val_loss: 24141746.0000
Epoch 132/200
140/140 [==============================] - 2s 11ms/step - loss: 24982094.0000 - val_loss: 24001458.0000
Epoch 133/200
140/140 [==============================] - 2s 12ms/step - loss: 24975626.0000 - val_loss: 24130434.0000
Epoch 134/200
140/140 [==============================] - 1s 11ms/step - loss: 24947372.0000 - val_loss: 24018510.0000
Epoch 135/200
140/140 [==============================] - 1s 10ms/step - loss: 24973410.0000 - val_loss: 24110008.0000
Epoch 136/200
140/140 [==============================] - 1s 10ms/step - loss: 24969276.0000 - val_loss: 23991990.0000
Epoch 137/200
140/140 [==============================] - 1s 10ms/step - loss: 24974246.0000 - val_loss: 23970310.0000
Epoch 138/200
140/140 [==============================] - 1s 10ms/step - loss: 24956014.0000 - val_loss: 24016054.0000
Epoch 139/200
140/140 [==============================] - 1s 10ms/step - loss: 24964032.0000 - val_loss: 24011076.0000
Epoch 140/200
140/140 [==============================] - 2s 12ms/step - loss: 24934796.0000 - val_loss: 23999986.0000
Epoch 141/200
140/140 [==============================] - 1s 10ms/step - loss: 24972754.0000 - val_loss: 24083666.0000
Epoch 142/200
140/140 [==============================] - 2s 12ms/step - loss: 25000574.0000 - val_loss: 24003060.0000
Epoch 143/200
140/140 [==============================] - 2s 12ms/step - loss: 24942678.0000 - val_loss: 24005724.0000
Epoch 144/200
140/140 [==============================] - 2s 11ms/step - loss: 24938292.0000 - val_loss: 23952630.0000
Epoch 145/200
140/140 [==============================] - 1s 10ms/step - loss: 24962446.0000 - val_loss: 23975312.0000
Epoch 146/200
140/140 [==============================] - 1s 11ms/step - loss: 24949578.0000 - val_loss: 23965870.0000
Epoch 147/200
140/140 [==============================] - 1s 11ms/step - loss: 24958758.0000 - val_loss: 24031736.0000
Epoch 148/200
140/140 [==============================] - 1s 10ms/step - loss: 24960426.0000 - val_loss: 24002066.0000
Epoch 149/200
140/140 [==============================] - 2s 11ms/step - loss: 24925728.0000 - val_loss: 24064732.0000
Epoch 150/200
140/140 [==============================] - 1s 10ms/step - loss: 24962734.0000 - val_loss: 24006482.0000
Epoch 151/200
140/140 [==============================] - 1s 10ms/step - loss: 24955840.0000 - val_loss: 24037332.0000
Epoch 152/200
140/140 [==============================] - 2s 12ms/step - loss: 24946512.0000 - val_loss: 23965522.0000
Epoch 153/200
140/140 [==============================] - 2s 13ms/step - loss: 24958988.0000 - val_loss: 24006634.0000
Epoch 154/200
140/140 [==============================] - 1s 10ms/step - loss: 24964880.0000 - val_loss: 24029852.0000
Epoch 155/200
140/140 [==============================] - 1s 10ms/step - loss: 24920076.0000 - val_loss: 24070566.0000
Epoch 156/200
140/140 [==============================] - 1s 10ms/step - loss: 24936414.0000 - val_loss: 24069494.0000
Epoch 157/200
140/140 [==============================] - 1s 10ms/step - loss: 24938692.0000 - val_loss: 24120954.0000
Epoch 158/200
140/140 [==============================] - 1s 10ms/step - loss: 24935782.0000 - val_loss: 23971382.0000
Epoch 159/200
140/140 [==============================] - 1s 10ms/step - loss: 24983624.0000 - val_loss: 24071888.0000
Epoch 160/200
140/140 [==============================] - 1s 10ms/step - loss: 24944786.0000 - val_loss: 24080434.0000
Epoch 161/200
140/140 [==============================] - 2s 12ms/step - loss: 24967014.0000 - val_loss: 23969514.0000
Epoch 162/200
140/140 [==============================] - 1s 11ms/step - loss: 24975800.0000 - val_loss: 24013842.0000
Epoch 163/200
140/140 [==============================] - 1s 10ms/step - loss: 24958988.0000 - val_loss: 24037610.0000
Epoch 164/200
140/140 [==============================] - 1s 10ms/step - loss: 24970828.0000 - val_loss: 24078654.0000
Epoch 165/200
140/140 [==============================] - 2s 12ms/step - loss: 24953478.0000 - val_loss: 24037872.0000
Epoch 166/200
140/140 [==============================] - 1s 11ms/step - loss: 24951518.0000 - val_loss: 24048376.0000
Epoch 167/200
140/140 [==============================] - 2s 12ms/step - loss: 24973490.0000 - val_loss: 24045190.0000
Epoch 168/200
140/140 [==============================] - 2s 11ms/step - loss: 24980542.0000 - val_loss: 23998420.0000
Epoch 169/200
140/140 [==============================] - 2s 12ms/step - loss: 24942098.0000 - val_loss: 23958950.0000
Epoch 170/200
140/140 [==============================] - 1s 10ms/step - loss: 24934512.0000 - val_loss: 23966860.0000
Epoch 171/200
140/140 [==============================] - 1s 10ms/step - loss: 24977478.0000 - val_loss: 23969202.0000
Epoch 172/200
140/140 [==============================] - 2s 12ms/step - loss: 24934972.0000 - val_loss: 24065004.0000
Epoch 173/200
140/140 [==============================] - 2s 12ms/step - loss: 24943132.0000 - val_loss: 23996002.0000
Epoch 174/200
140/140 [==============================] - 1s 10ms/step - loss: 24945278.0000 - val_loss: 24106086.0000
Epoch 175/200
140/140 [==============================] - 2s 12ms/step - loss: 24943152.0000 - val_loss: 24023820.0000
Epoch 176/200
140/140 [==============================] - 1s 10ms/step - loss: 24919540.0000 - val_loss: 23997750.0000
Epoch 177/200
140/140 [==============================] - 1s 11ms/step - loss: 24956538.0000 - val_loss: 24014328.0000
Epoch 178/200
140/140 [==============================] - 2s 12ms/step - loss: 24915706.0000 - val_loss: 24006242.0000
Epoch 179/200
140/140 [==============================] - 2s 11ms/step - loss: 24944080.0000 - val_loss: 24138884.0000
Epoch 180/200
140/140 [==============================] - 2s 12ms/step - loss: 24916744.0000 - val_loss: 24006288.0000
Epoch 181/200
140/140 [==============================] - 2s 12ms/step - loss: 24975820.0000 - val_loss: 24023412.0000
Epoch 182/200
140/140 [==============================] - 2s 12ms/step - loss: 24963422.0000 - val_loss: 23941598.0000
Epoch 183/200
140/140 [==============================] - 2s 13ms/step - loss: 24954986.0000 - val_loss: 23993508.0000
Epoch 184/200
140/140 [==============================] - 1s 10ms/step - loss: 24945432.0000 - val_loss: 24017336.0000
Epoch 185/200
140/140 [==============================] - 2s 11ms/step - loss: 24930662.0000 - val_loss: 24020310.0000
Epoch 186/200
140/140 [==============================] - 1s 11ms/step - loss: 24915182.0000 - val_loss: 23985608.0000
Epoch 187/200
140/140 [==============================] - 1s 10ms/step - loss: 24962062.0000 - val_loss: 23952238.0000
Epoch 188/200
140/140 [==============================] - 2s 12ms/step - loss: 24962770.0000 - val_loss: 24066004.0000
Epoch 189/200
140/140 [==============================] - 1s 10ms/step - loss: 24896464.0000 - val_loss: 24018402.0000
Epoch 190/200
140/140 [==============================] - 2s 11ms/step - loss: 24987660.0000 - val_loss: 24043242.0000
Epoch 191/200
140/140 [==============================] - 1s 11ms/step - loss: 24955008.0000 - val_loss: 23986600.0000
Epoch 192/200
140/140 [==============================] - 2s 11ms/step - loss: 24940240.0000 - val_loss: 24055326.0000
Epoch 193/200
140/140 [==============================] - 2s 12ms/step - loss: 24928714.0000 - val_loss: 24033910.0000
Epoch 194/200
140/140 [==============================] - 2s 11ms/step - loss: 24963380.0000 - val_loss: 23988000.0000
Epoch 195/200
140/140 [==============================] - 2s 13ms/step - loss: 24919894.0000 - val_loss: 23949462.0000
Epoch 196/200
140/140 [==============================] - 1s 11ms/step - loss: 24987474.0000 - val_loss: 24026156.0000
Epoch 197/200
140/140 [==============================] - 2s 13ms/step - loss: 24934528.0000 - val_loss: 24031410.0000
Epoch 198/200
140/140 [==============================] - 2s 13ms/step - loss: 24938476.0000 - val_loss: 24007584.0000
Epoch 199/200
140/140 [==============================] - 2s 13ms/step - loss: 24911688.0000 - val_loss: 24069546.0000
Epoch 200/200
140/140 [==============================] - 2s 13ms/step - loss: 24917838.0000 - val_loss: 23991824.0000
In [112]:
X_val_synopsis = gv.transform_glove_encoding(X_val['Synopsis'])
test_data_synopsis = gv.transform_glove_encoding(test_data['Synopsis'])

Final merge

In [113]:
X_train = pd.concat([X_train_std,\
                     X_train_ohe,\
                     X_train_title,\
                     X_train_synopsis],axis=1)
X_val = pd.concat([X_val_std,\
                   X_val_ohe,\
                   X_val_title,\
                   X_val_synopsis],axis=1)
X_test = pd.concat([test_dataset_std,\
                    X_test_ohe,\
                    test_data_title,\
                    test_data_synopsis],axis=1)
logger.debug(f'Shape of X_train is {X_train.shape}')
logger.debug(f'Shape of X_val is {X_val.shape}')
logger.debug(f'Shape of X_test is {X_test.shape}')
DEBUG:root:Shape of X_train is (4463, 254)
DEBUG:root:Shape of X_val is (1872, 254)
DEBUG:root:Shape of X_test is (1560, 254)
In [114]:
eda.explain_data(X_train)
The data has 4463 rows and 254 columns
Below are the column wise data-types,missing values, unique level and descriptive stats of the data
Out[114]:
dtypes missing_values unique_values count mean std min 25% 50% 75% max
title_len float64 0 31 4463.000 0.197 0.142 0.000 0.097 0.161 0.258 1.000
Author_Title_count float64 0 30 4463.000 0.085 0.166 0.000 0.000 0.029 0.087 1.000
Author_LogPrice_amin float64 0 488 4463.000 0.308 0.202 0.000 0.158 0.269 0.472 1.000
Author_LogPrice_mean float64 0 841 4463.000 0.471 0.119 0.000 0.403 0.463 0.554 1.000
Author_LogPrice_amax float64 0 532 4463.000 0.618 0.241 0.000 0.397 0.597 0.838 1.000
... ... ... ... ... ... ... ... ... ... ... ...
Synopis95 float32 0 4061 4463.000 0.332 0.098 0.000 0.276 0.304 0.357 1.000
Synopis96 float32 0 4073 4463.000 0.315 0.146 0.000 0.211 0.268 0.394 1.000
Synopis97 float32 0 4067 4463.000 0.331 0.150 0.000 0.229 0.277 0.411 1.000
Synopis98 float32 0 4063 4463.000 0.247 0.128 0.000 0.172 0.201 0.283 1.000
Synopis99 float32 0 4075 4463.000 0.334 0.143 0.000 0.242 0.320 0.412 1.000

254 rows × 11 columns

Model Building

Linear Regression

OLS
In [115]:
model_desc = 'LinearReg' + encoding_strategy
In [116]:
lr = LinearRegression(normalize=True)
lr.fit(X_train,y_train)
Out[116]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
In [117]:
logger.debug(f'R2 score is {lr.score(X_train,y_train)}')
DEBUG:root:R2 score is 0.598557044001483
In [118]:
x_train_pred = lr.predict(X_train)
x_val_pred = lr.predict(X_val)
In [213]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,\
                 trendline="ols",\
                 hover_data=[y_val.index],\
                 log_y=True)

fig.show()
In [120]:
el.add_metrics(y_train,y_val,X_train,X_val,lr,model_desc)
In [121]:
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,lr.predict(X_test)),\
                         name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
In [122]:
el.get_metrics()
Out[122]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
In [123]:
features = X_train.columns
coef = pd.Series(lr.coef_,index=features)
logger.debug(f"Coef are \n{coef.sort_values()}")
DEBUG:root:Coef are 
year                                                  -11599247366915.854
BookCategory_LogPrice_std                               -576673579954.268
top_Author_x0_hidenorikusaka                            -270813697130.362
top_Author_x0_eiichirooda                               -270813697130.359
top_Author_x0_davpilkey                                 -270813697130.355
                                                              ...        
BookCategory_x0_Comics & Mangas                          189529959275.666
BookCategory_LogPrice_amax                               198470491398.077
BookCategory_x0_Computing, Internet & Digital Media      319015616002.826
BookCategory_x0_Language, Linguistics & Writing          404474106734.458
year_month                                             11606070453602.057
Length: 254, dtype: float64
ElasticNet
In [124]:
model_desc = 'ElasticNet' + encoding_strategy
en = ElasticNet()
params = {
          'alpha':np.linspace(.0001,1,3),
          'l1_ratio':[.7,.9,1],
          'normalize' : [False],
          'selection': ['cyclic','random']  
          }

logger.debug(f"Params are {params}")
DEBUG:root:Params are {'alpha': array([1.0000e-04, 5.0005e-01, 1.0000e+00]), 'l1_ratio': [0.7, 0.9, 1], 'normalize': [False], 'selection': ['cyclic', 'random']}
In [125]:
en = mb.build_model(en,params,X_train,y_train,X_val,model_desc,
               PROJECT_PATH,scoring='neg_mean_squared_log_error',verbose=10,\
               has_sample_weight=0)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1927s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1126s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1889s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done  87 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   13.2s finished
Best parameters for gs are {'alpha': 0.0001, 'l1_ratio': 1, 'normalize': False, 'selection': 'cyclic'}
Here are the Top 100 train results 
[2.76208553 2.754588   2.7919878  2.74737975 2.7082168  2.50301061
 2.92471167 2.85656803 2.47240186 2.55979174 3.13613542 2.43088601
 2.76342789 2.60095717 2.65473885 2.4197747  3.11700214 2.45867098
 2.55453853 2.46415556 2.61025859 2.66631836 3.06447324 2.84568284
 2.39642504 2.896463   2.95820973 2.66407112 2.40088085 2.55986433
 2.57775615 2.73619919 2.59046197 2.01564733 2.99989366 2.43877179
 2.31942241 2.2037193  2.38758251 2.75032718 2.36976048 3.32830287
 2.63404035 2.66154303 3.03405277 2.80751801 2.49625542 2.93896465
 2.68190992 2.80650716 2.71385205 2.15519287 2.37446759 2.36039971
 3.30411891 2.66976404 2.32405706 2.49921491 2.62507705 2.92521748
 2.34924895 2.95754708 2.95364157 2.62233317 2.36265457 2.79200901
 2.37851685 2.54956185 2.80587643 2.99365535 2.65119937 2.77381578
 2.71411229 2.4257735  2.59319847 2.40602446 2.41596799 2.7435412
 2.47005834 2.50060154 2.30242845 2.71916945 2.53619692 2.53387596
 2.59584163 2.65210458 2.87812658 2.95321324 2.45913741 2.62685934
 2.82031635 2.68094222 2.53373212 2.44598987 2.55275299 2.76716604
 2.74925591 2.43344705 3.14088103 2.52432544]
Here are the Top 100 val results 
[2.88166143 2.78796098 2.5724786  2.33132496 2.396681   2.59686509
 2.8831952  2.57478721 2.80099377 2.6742512  2.54185635 2.60475938
 2.34787005 2.4082665  2.66828291 2.33719139 2.68585013 2.75062669
 2.60750555 2.67382504 2.71110285 2.81587175 2.87964562 2.47637773
 2.85302078 2.39920458 2.6619047  2.59987728 2.38385353 2.39263756
 3.00821581 2.49497576 2.54536164 2.57500656 2.78464193 2.67819324
 2.55834661 2.43878161 3.07825679 2.54578177 2.64623599 2.62320149
 2.81463178 2.80224856 2.88357091 2.3948947  2.51429828 2.75850399
 2.79755121 2.6946844  3.20084516 2.41770133 2.78019962 2.83952458
 2.84612795 2.78739322 2.69944795 2.0452282  2.7463596  2.6581472
 2.67144589 2.48864073 2.84042534 2.47978157 2.92989373 2.64278109
 2.38542596 2.54511899 2.80527247 2.86143352 2.50603093 2.54030426
 2.75906826 2.46444827 2.74408519 2.70539993 2.56051077 3.21030744
 3.02367593 2.81165371 2.67521675 2.00665014 2.47565776 2.73626563
 2.61084134 2.6926004  2.87913112 2.30946662 2.72804579 2.5400454
 2.54919851 2.39009547 2.75398499 2.51214908 2.68921762 2.53975025
 2.56975455 2.44215637 2.89170189 2.54313069]
In [126]:
x_train_pred = en.predict(X_train)
x_val_pred = en.predict(X_val)
In [127]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,\
                 trendline="ols",\
                 hover_data=[y_val.index])

fig.show()
In [128]:
el.add_metrics(y_train,y_val,X_train,X_val,en,model_desc)
In [129]:
el.get_metrics()
Out[129]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
In [130]:
el.save_data(PROJECT_PATH)
In [131]:
features = X_train.columns
coef = pd.Series(en.coef_,index=features)
logger.debug(f"Coef are \n{coef.sort_values(ascending=False)}")
DEBUG:root:Coef are 
Author_LogPrice_mean          1.569
Genre_LogPrice_mean           0.695
synopsis_len                  0.535
sour_LogPrice_mean            0.446
bind_LogPrice_amax            0.332
                              ...  
BookCategory_LogPrice_mean   -0.069
bind_Title_count             -0.099
year_month                   -0.140
bind_LogPrice_std            -0.158
rating_num                   -0.252
Length: 254, dtype: float64

DecisionTree

In [132]:
model_desc = 'DecisionTree' + encoding_strategy
dt = DecisionTreeRegressor()
params = {
          'criterion' : ['mse'], #['mse','poisson','mae'],
          'min_samples_split' : [2], #np.linspace(2,42,3,dtype='int64'),
          'min_samples_leaf' : [42], #np.linspace(22,62,3,dtype='int64'),
          'max_depth': [5,10,None],
          'min_impurity_decrease': np.linspace(0,3,2,dtype='int64')  
         }
logger.debug(params)
DEBUG:root:{'criterion': ['mse'], 'min_samples_split': [2], 'min_samples_leaf': [42], 'max_depth': [5, 10, None], 'min_impurity_decrease': array([0, 3])}
In [133]:
dt = mb.build_model(dt,params,X_train,y_train,X_val,model_desc,
               PROJECT_PATH,scoring='neg_mean_squared_log_error',
               verbose=10)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    4.3s finished
Best parameters for gs are {'criterion': 'mse', 'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_leaf': 42, 'min_samples_split': 2}
Here are the Top 100 train results 
[2.63648606 2.71493384 2.75174919 2.63648606 2.75174919 2.55191564
 2.87628952 2.41250722 2.52885861 2.55191564 2.87628952 2.41250722
 2.71606293 2.75174919 2.63648606 2.41824784 3.11832867 2.34931962
 2.56137553 2.41824784 2.63648606 2.75174919 3.11832867 2.75174919
 2.52885861 2.97652967 2.8258856  2.55191564 2.41250722 2.63648606
 2.55191564 2.63648606 2.63648606 2.15892431 2.97652967 2.34931962
 2.24993233 2.33268578 2.55191564 2.75174919 2.34931962 3.3891654
 2.63648606 2.63648606 2.87628952 2.87628952 2.41250722 2.8258856
 2.75174919 2.71493384 2.63648606 2.01704701 2.41250722 2.41250722
 3.3891654  2.63648606 2.24993233 2.41250722 2.52885861 2.97652967
 2.41250722 3.2276226  3.2276226  2.75174919 2.34931962 2.63648606
 2.41824784 2.55191564 2.87628952 2.97950788 2.63648606 2.8258856
 2.52885861 2.41250722 2.55191564 2.52885861 2.56137553 2.75174919
 2.52885861 2.41250722 2.34931962 2.63648606 2.52885861 2.52885861
 2.63648606 2.63648606 2.8258856  2.87628952 2.41824784 2.63648606
 2.63648606 2.63648606 2.52885861 2.34931962 2.52885861 2.63648606
 2.75174919 2.52885861 3.11832867 2.55191564]
Here are the Top 100 val results 
[2.71493384 2.75174919 2.55191564 2.34931962 2.41250722 2.55191564
 2.75174919 2.52885861 2.87628952 2.75174919 2.52885861 2.63648606
 2.41250722 2.41824784 2.63648606 2.34931962 2.75174919 2.87628952
 2.55191564 2.63648606 2.75174919 2.63648606 2.75174919 2.56137553
 2.97652967 2.34931962 2.55191564 2.63648606 2.41250722 2.34931962
 2.97950788 2.41250722 2.55191564 2.75174919 2.75174919 2.55191564
 2.63648606 2.41250722 2.99727437 2.55191564 2.75174919 2.52885861
 2.75174919 2.75174919 2.8258856  2.63648606 2.55191564 2.75174919
 2.75174919 2.63648606 2.87628952 2.41824784 2.8258856  2.97652967
 2.75174919 2.87628952 2.75174919 2.13579313 2.63648606 2.52885861
 2.63648606 2.52885861 2.75174919 2.55191564 2.75174919 2.63648606
 2.34931962 2.63648606 2.87628952 2.87628952 2.52885861 2.55191564
 2.75174919 2.41250722 2.75174919 2.63648606 2.75174919 2.99727437
 3.11832867 2.75174919 2.63648606 1.89891365 2.52885861 2.75174919
 2.52885861 2.63648606 2.8258856  2.34931962 2.71606293 2.55191564
 2.52885861 2.33268578 2.75174919 2.55191564 2.75174919 2.55191564
 2.52885861 2.52885861 2.8258856  2.52885861]
In [134]:
x_train_pred = dt.predict(X_train)
x_val_pred = dt.predict(X_val)
In [135]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,\
                 trendline="ols")

fig.show()
In [136]:
el.add_metrics(y_train,y_val,X_train,X_val,dt,model_desc)
In [137]:
el.get_metrics()
Out[137]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
45 DecisionTreegloveSynopsisgloveTitle 0.549 0.049 0.159 1.063 0.054 0.168 1.066
In [138]:
el.save_data(PROJECT_PATH)
In [139]:
mb.plot_feature_importances(X_train,dt)
Out[139]:
feature importance
0 Author_LogPrice_mean 0.845
1 Genre_LogPrice_mean 0.083
2 bind_LogPrice_amin 0.018
3 bind_LogPrice_mean 0.016
4 BookCategory_x0_Language, Linguistics & Writing 0.009
... ... ...
249 top_Author_x0_lucacaioli 0.000
250 top_Author_x0_maplepress 0.000
251 top_Author_x0_masashikishimoto 0.000
252 top_Author_x0_matthewreilly 0.000
253 Synopis99 0.000

254 rows × 2 columns

RandomForest

All features
In [140]:
model_desc = 'RandomForest' + encoding_strategy
rf = RandomForestRegressor()
params = {
          'n_estimators': [75],
          'criterion' : ['mae'], #['mse','mae'],
          'min_samples_split' : [3], #np.linspace(2,20,3,dtype='int64'),
          'min_samples_leaf' : [3], #np.linspace(1,30,4,dtype='int64')  
          'min_impurity_decrease': [0], #np.linspace(0,10,3)
          'max_depth' :  [None],
          'max_features' : [.7] #['auto',.7,'sqrt','log2']
         }
logger.debug(params)
DEBUG:root:{'n_estimators': [75], 'criterion': ['mae'], 'min_samples_split': [3], 'min_samples_leaf': [3], 'min_impurity_decrease': [0], 'max_depth': [None], 'max_features': [0.7]}
In [141]:
rf = mb.build_model(rf,params,X_train,y_train,X_val,model_desc,
                    PROJECT_PATH,scoring='neg_mean_squared_log_error',
                    verbose=10)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 16.9min remaining: 11.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 22.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 22.3min finished
Best parameters for gs are {'criterion': 'mae', 'max_depth': None, 'max_features': 0.7, 'min_impurity_decrease': 0, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 75}
Here are the Top 100 train results 
[2.57280777 2.82440005 2.75500847 2.77935782 2.67459334 2.39685605
 2.72528937 2.70569073 2.51915311 2.51539948 2.94678506 2.47462748
 2.75742023 2.61300319 2.72031189 2.25713247 2.80247048 2.35923825
 2.50772548 2.53515406 2.48441224 2.71710121 3.07251253 2.69605517
 2.4027394  3.0134093  2.88795715 2.62188043 2.4251231  2.58326968
 2.60284608 2.73332048 2.45301335 2.12996078 2.90628316 2.43898114
 2.30881342 2.34655501 2.52632877 2.79097172 2.41901295 3.54869448
 2.60455772 2.60423265 3.00151437 2.78968506 2.34112718 2.89626428
 2.63464475 2.85487389 2.77281516 2.09166037 2.47601862 2.27949604
 3.23530207 2.56154595 2.24926204 2.51884569 2.64925985 2.8251664
 2.30304262 3.16801382 3.20931746 2.64829074 2.31711748 2.84281016
 2.11929548 2.57253006 2.69044635 3.04559288 2.5169136  2.6508591
 2.55276045 2.4369978  2.70159013 2.51576225 2.41916519 2.69533667
 2.5190946  2.55181532 2.26138239 2.65978626 2.62306966 2.57713179
 2.56526168 2.52591762 2.73768426 2.86713313 2.48509279 2.73852636
 2.88515551 2.79109662 2.57663501 2.40193809 2.46038859 2.52572836
 2.97952375 2.52107257 3.27519959 2.49247182]
Here are the Top 100 val results 
[2.75608557 2.6930215  2.53002685 2.33440083 2.42360998 2.60864204
 2.88215066 2.51723037 2.78650905 2.77348191 2.60868097 2.63944603
 2.41704007 2.492715   2.67967177 2.35037864 2.6915594  2.75523498
 2.56982021 2.72770763 2.69131037 2.69487671 2.82162675 2.40468018
 2.88286845 2.35431923 2.65107528 2.61949515 2.36814962 2.38635169
 3.12071796 2.44819771 2.53529347 2.57869226 2.84618659 2.61909877
 2.59499493 2.42601498 3.05083584 2.5824495  2.63899215 2.60149608
 2.7156947  2.85369147 2.73554246 2.54791967 2.51732953 2.90571855
 2.69986452 2.65974433 2.94623375 2.49377122 2.83945657 2.76489231
 2.83672222 2.72032671 2.66199534 2.01964812 2.88728904 2.68490568
 2.65987795 2.46140303 2.77091099 2.4661299  2.92755025 2.63180986
 2.38098355 2.59046479 2.82633884 2.81333735 2.56564354 2.524278
 2.77660307 2.46216775 2.70796711 2.84266816 2.60506876 3.04988342
 3.05670662 2.79802872 2.59193588 1.83363245 2.49351953 2.68807242
 2.57892688 2.67588946 2.93290629 2.32523184 2.49002948 2.57044208
 2.49049046 2.28805594 2.75384592 2.5038227  2.75523481 2.57563308
 2.58100778 2.51510191 2.93920049 2.49010939]
In [142]:
x_train_pred = rf.predict(X_train)
x_val_pred = rf.predict(X_val)
In [143]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,
                 trendline='ols')
fig.show()
In [144]:
el.add_metrics(y_train,y_val,X_train,X_val,rf,model_desc)
In [145]:
el.get_metrics()
Out[145]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
45 DecisionTreegloveSynopsisgloveTitle 0.549 0.049 0.159 1.063 0.054 0.168 1.066
46 RandomForestgloveSynopsisgloveTitle 0.860 0.015 0.077 1.034 0.047 0.147 1.061
In [146]:
el.save_data(PROJECT_PATH)
In [147]:
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,rf.predict(X_test)),\
                         name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
In [148]:
# #Loading the file.
# file_name = str(model_desc) + '.pkl'
# rf = pickle.load(open(os.\
#                       path.\
#                       join(PROJECT_PATH,\
#                            'model',\
#                            file_name),'rb'))
Top features
In [197]:
top_features = mb.plot_feature_importances(X_train,rf)
In [150]:
logger.debug(f"Top 50 features are \n {top_features[:50]}")
DEBUG:root:Top 50 features are 
                        feature  importance
0         Author_LogPrice_mean       0.359
1          Genre_LogPrice_mean       0.055
2         Author_LogPrice_amin       0.042
3         Author_LogPrice_amax       0.035
4                   rating_num       0.013
5          Author_LogPrice_std       0.009
6                 synopsis_len       0.008
7                   review_num       0.007
8          Genre_LogPrice_amin       0.006
9           Genre_LogPrice_std       0.006
10                      Title2       0.005
11                   Synopis95       0.005
12                      Title0       0.005
13                   Synopis59       0.005
14                   Synopis70       0.005
15                      Title8       0.005
16                   Synopis20       0.005
17                  year_month       0.004
18                   Synopis99       0.004
19                   Synopis17       0.004
20                   Synopis16       0.004
21                   Synopis32       0.004
22                   Synopis43       0.004
23                   Synopis23       0.004
24                   Synopis47       0.004
25                   Synopis26       0.004
26                   Synopis51       0.004
27                   Synopis73       0.004
28                      Title4       0.004
29                   Synopis91       0.004
30                   Synopis66       0.004
31                   Synopis44       0.004
32                    Synopis2       0.004
33                   Synopis31       0.004
34                   Synopis92       0.004
35                   Synopis61       0.004
36  BookCategory_LogPrice_mean       0.004
37                   Synopis87       0.004
38                   Synopis85       0.004
39                   Synopis30       0.004
40                   Synopis55       0.004
41                    Synopis5       0.004
42                   Synopis64       0.004
43                   Synopis38       0.004
44                   Synopis82       0.004
45                      Title1       0.004
46                   Synopis80       0.004
47                   Synopis34       0.004
48                   Synopis86       0.004
49                   Synopis72       0.004
In [151]:
l = int(.50*len(top_features))
top_features = top_features.loc[:l,'feature']
In [152]:
model_desc = 'RandomForestTopFeatures' + encoding_strategy
rf_top = mb.build_model(rf,\
                        params,\
                        X_train[top_features],\
                        y_train,\
                        X_val[top_features],\
                        model_desc,\
                        PROJECT_PATH,\
                        scoring='neg_mean_squared_log_error',
                        verbose=10)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 14.3min remaining:  9.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 18.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 18.9min finished
Best parameters for gs are {'criterion': 'mae', 'max_depth': None, 'max_features': 0.7, 'min_impurity_decrease': 0, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 75}
Here are the Top 100 train results 
[2.576742   2.7481597  2.74112815 2.73689109 2.70611537 2.42163278
 2.7272743  2.58019633 2.5181404  2.4999174  2.96680749 2.44976284
 2.69192436 2.62929842 2.66114487 2.33022039 2.89545773 2.34263545
 2.47498002 2.54973426 2.46534215 2.68066287 3.08244022 2.7196811
 2.37751669 3.00066071 2.90400881 2.59812037 2.41959912 2.58500426
 2.6312868  2.74102805 2.46680567 2.13134834 2.92825559 2.39948178
 2.31508084 2.34707656 2.53675209 2.78624873 2.3888818  3.51323012
 2.5720671  2.62968687 2.9667208  2.79035365 2.38244692 2.91444972
 2.63520557 2.81100663 2.77528706 2.08399271 2.42352522 2.31999431
 3.26585829 2.57150379 2.25850184 2.47185944 2.62186891 2.85677572
 2.33951502 3.05876187 3.16495866 2.62499948 2.33020331 2.85006653
 2.21361964 2.55340366 2.66582284 2.98740294 2.57326843 2.70711095
 2.56758734 2.44978451 2.72492961 2.51024647 2.43319631 2.71206187
 2.50451483 2.53736569 2.2831656  2.61319423 2.63085543 2.5529961
 2.60957664 2.54012687 2.74424991 2.87978034 2.49713612 2.71912476
 2.86304113 2.75667294 2.5705897  2.42344674 2.53666435 2.5622477
 2.89446528 2.51806249 3.24853813 2.50522317]
Here are the Top 100 val results 
[2.70434539 2.71272422 2.54155575 2.34281316 2.4105274  2.56962021
 2.89867887 2.56590695 2.74711294 2.7688296  2.57543024 2.62262164
 2.43657055 2.4650659  2.66360002 2.32987748 2.69171923 2.81011212
 2.54318014 2.76684962 2.66328926 2.65537021 2.79255207 2.36956757
 2.93892578 2.33661849 2.60746483 2.60722717 2.35936274 2.36282233
 3.00268675 2.40952461 2.51102119 2.58400001 2.85493532 2.59434138
 2.59513805 2.41685341 3.09617751 2.60939392 2.6184092  2.60833284
 2.79968186 2.84530837 2.77146895 2.56137284 2.52396822 2.8437332
 2.77203979 2.6837158  2.96680749 2.48876746 2.84544658 2.7162458
 2.78585303 2.76953853 2.68870461 2.01189491 2.79737451 2.60102402
 2.63421645 2.46473936 2.77195517 2.44963392 2.90677506 2.62818157
 2.40390286 2.58750129 2.7866352  2.79929468 2.52698453 2.55141571
 2.69633657 2.41904756 2.67804332 2.86397115 2.58846323 3.08295758
 3.11576485 2.82138457 2.59264905 1.80988396 2.5043361  2.6971299
 2.58012534 2.6614414  2.87801944 2.37011646 2.56299069 2.52903681
 2.52617323 2.28664005 2.8182876  2.49545023 2.73210214 2.56496287
 2.57963131 2.49327267 2.92093433 2.49417268]
In [153]:
x_train_pred = rf_top.predict(X_train[top_features])
x_val_pred = rf_top.predict(X_val[top_features])
In [154]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,
                 trendline='ols')
fig.show()
In [155]:
el.add_metrics(y_train,\
               y_val,\
               X_train[top_features],\
               X_val[top_features],\
               rf_top,\
               model_desc)
In [156]:
el.get_metrics()
Out[156]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
45 DecisionTreegloveSynopsisgloveTitle 0.549 0.049 0.159 1.063 0.054 0.168 1.066
46 RandomForestgloveSynopsisgloveTitle 0.860 0.015 0.077 1.034 0.047 0.147 1.061
47 RandomForestTopFeaturesgloveSynopsisgloveTitle 0.814 0.020 0.093 1.040 0.047 0.148 1.062
In [157]:
el.save_data(PROJECT_PATH)

SVM

In [158]:
model_desc = 'SVM' + encoding_strategy
svm = SVR()
params = {
          'kernel': ['rbf'], #['rbf','poly','sigmoid'],
          'degree': [3], #np.linspace(3,7,3,dtype='int64'),
          'C' : [1,.5,3], #np.linspace(.0001,12.5,3),
          'epsilon' : [.1],#np.linspace(.1,.3,2,dtype='float64'),
          'gamma': ['scale',.01,.1]
          }
logger.debug(params)
DEBUG:root:{'kernel': ['rbf'], 'degree': [3], 'C': [1, 0.5, 3], 'epsilon': [0.1], 'gamma': ['scale', 0.01, 0.1]}
In [159]:
svm = mb.build_model(svm,params,X_train,y_train,X_val,model_desc,
                    PROJECT_PATH,scoring='neg_root_mean_squared_error',
                     verbose=10)
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.9min finished
Best parameters for gs are {'C': 3, 'degree': 3, 'epsilon': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
Here are the Top 100 train results 
[2.71860236 2.79704817 2.80320737 2.72222194 2.70988884 2.50957464
 2.83595094 2.75564831 2.45956516 2.52630406 3.05611295 2.41681049
 2.71704432 2.60455944 2.63334182 2.41091824 3.14143173 2.43188156
 2.54746323 2.47495879 2.6080386  2.61159643 3.05560914 2.82419266
 2.36303371 2.90701032 2.92870286 2.65667918 2.3863164  2.56361984
 2.60850969 2.75173575 2.55903799 2.0553898  3.00386224 2.40742653
 2.28745693 2.23681924 2.42336046 2.71352403 2.37149779 3.3133675
 2.56303077 2.6519124  2.9788664  2.82194973 2.4708057  2.92525335
 2.68841964 2.80439867 2.75483785 2.16547047 2.39117194 2.31611542
 3.26750344 2.64929026 2.29902262 2.47148879 2.6614285  2.91678104
 2.32903956 2.9488124  2.99099726 2.63363416 2.37699913 2.80864395
 2.38565029 2.56109958 2.80180054 2.97096187 2.61910409 2.75992633
 2.70665848 2.47016293 2.5901601  2.41660228 2.38550427 2.71951674
 2.43930139 2.45504557 2.31468588 2.6671307  2.59975676 2.51159452
 2.59994723 2.53918952 2.81388069 2.8818958  2.48946653 2.66000028
 2.75563365 2.73074188 2.52163977 2.44704264 2.55088485 2.73389165
 2.7587873  2.45628502 3.1548531  2.54189927]
Here are the Top 100 val results 
[2.87428858 2.79444684 2.56254826 2.35868006 2.42319968 2.55472263
 2.9043867  2.53322521 2.77778929 2.64446392 2.56781679 2.61860068
 2.35862154 2.44512651 2.67296625 2.30792057 2.64977921 2.81292358
 2.60026428 2.70610671 2.71083758 2.78582058 2.88492004 2.47658567
 2.86257569 2.40153868 2.62414121 2.60966784 2.32610379 2.42320696
 3.03793082 2.52201394 2.50827882 2.58020266 2.80245269 2.66225023
 2.59909716 2.40803133 3.03348579 2.53813274 2.6813522  2.61151811
 2.78903087 2.75134599 2.83346268 2.45944116 2.52086849 2.74742866
 2.74015088 2.72984255 3.03758925 2.40770864 2.72184055 2.8292987
 2.81384905 2.74631633 2.73893135 2.05646408 2.78027957 2.57936088
 2.55463632 2.51404548 2.7813977  2.45896874 2.95208585 2.665176
 2.38720287 2.53975836 2.84949556 2.86637799 2.50597257 2.52017978
 2.7737982  2.46242378 2.75423541 2.68803567 2.55684053 3.12201365
 3.04409307 2.83479084 2.69623575 1.88082896 2.53639525 2.74233489
 2.62961062 2.69626714 2.82034666 2.33007688 2.79032822 2.52674618
 2.54297158 2.34842521 2.75547684 2.53209166 2.66688632 2.5736378
 2.55321652 2.45306583 2.81390333 2.528395  ]
In [160]:
x_train_pred = svm.predict(X_train)
x_val_pred = svm.predict(X_val)
In [161]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,\
                 trendline="ols")

fig.show()
In [162]:
el.add_metrics(y_train,y_val,X_train,X_val,svm,model_desc)
In [163]:
el.get_metrics()
Out[163]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
45 DecisionTreegloveSynopsisgloveTitle 0.549 0.049 0.159 1.063 0.054 0.168 1.066
46 RandomForestgloveSynopsisgloveTitle 0.860 0.015 0.077 1.034 0.047 0.147 1.061
47 RandomForestTopFeaturesgloveSynopsisgloveTitle 0.814 0.020 0.093 1.040 0.047 0.148 1.062
48 SVMgloveSynopsisgloveTitle 0.627 0.041 0.143 1.057 0.047 0.156 1.061
In [164]:
el.save_data(PROJECT_PATH)
In [165]:
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,svm.predict(X_test)),\
                         name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)

XGBoost

All features
In [166]:
model_desc = 'XGBoost' + encoding_strategy
xgrf = XGBRegressor()
params = {
          'n_estimators': [100],
          'objective' : ['reg:squarederror'], #['mse','mae'],
          'max_depth' : [4], #np.linspace(3,4,2,dtype='int64'),
          'learning_rate': [.23], #np.linspace(.1,1,4),
          'gamma': [0], #np.linspace(0,10,4),
           'min_child_weight': [5,20], #np.linspace(1,10,3,dtype='int64'),
           'colsample_bytree': [.85], #np.linspace(.4,1,4),
           'subsample' : [1], #np.linspace(.7,1,3),
           'max_delta_step': [0], #[0,5],
           'reg_lambda' : [1], #np.linspace(1,2,2,dtype='int64'),
           'reg_alpha' : [0] #np.linspace(0,2,2,dtype='int64')
         }
logger.debug(params)
DEBUG:root:{'n_estimators': [100], 'objective': ['reg:squarederror'], 'max_depth': [4], 'learning_rate': [0.23], 'gamma': [0], 'min_child_weight': [5, 20], 'colsample_bytree': [0.85], 'subsample': [1], 'max_delta_step': [0], 'reg_lambda': [1], 'reg_alpha': [0]}
In [167]:
xgrf = mb.build_model(xgrf,params,X_train,y_train,X_val,model_desc,
                    PROJECT_PATH,scoring='neg_mean_squared_log_error',
                    verbose=10)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   33.0s finished
Best parameters for gs are {'colsample_bytree': 0.85, 'gamma': 0, 'learning_rate': 0.23, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 1}
Here are the Top 100 train results 
[2.6600738 2.8459616 2.7298124 2.742464  2.721769  2.3667612 2.8643587
 2.8867881 2.4755177 2.5985203 2.8842807 2.4635296 2.662631  2.5695467
 2.7148294 2.1822917 2.9170759 2.414874  2.4969459 2.5084605 2.4151368
 2.6617134 3.0628433 2.7799418 2.359201  3.0474539 2.9144876 2.5365422
 2.398371  2.5173194 2.5774329 2.7456875 2.4991155 2.092359  3.0065546
 2.478608  2.3076048 2.331245  2.5020335 2.715503  2.4063177 3.5246027
 2.5617042 2.5182567 3.0446553 2.7935574 2.464666  2.9745104 2.655312
 2.8114662 2.6909397 2.1121907 2.369197  2.1958227 3.2349665 2.645427
 2.3220186 2.5357738 2.6019692 2.8298566 2.3294015 3.0264418 3.2385325
 2.6249237 2.364984  2.7995272 1.9733145 2.5157828 2.64904   3.0282404
 2.5469627 2.6586928 2.638565  2.432939  2.8137457 2.4799914 2.3664315
 2.828429  2.521859  2.4822361 2.2647765 2.7488196 2.5874507 2.5911438
 2.578963  2.5156274 2.748304  2.939225  2.4720707 2.770088  2.8162527
 2.8356884 2.5151231 2.5268543 2.4643652 2.6971068 2.9148555 2.5659175
 3.220916  2.543447 ]
Here are the Top 100 val results 
[2.7880075 2.7665656 2.5538452 2.315329  2.308836  2.5641236 2.8099792
 2.5965989 2.8114073 2.7224743 2.5503423 2.6559618 2.475192  2.5087223
 2.653528  2.293168  2.6133194 2.7369378 2.6009293 2.8730981 2.7063944
 2.751276  2.8074229 2.500066  2.83038   2.3558114 2.6322734 2.5711787
 2.3688776 2.3610365 3.1300313 2.3737988 2.580709  2.591648  2.9131334
 2.6444705 2.5696816 2.442968  3.069544  2.5392842 2.6587954 2.557901
 2.8049521 2.9561248 2.839357  2.291839  2.6612809 2.8242264 2.918494
 2.7083216 2.8842807 2.3466449 2.788634  2.922073  2.881856  2.6900623
 2.704084  1.8556857 2.839855  2.741772  2.7291548 2.3916323 2.8651156
 2.5036438 2.9110415 2.6757529 2.4521646 2.6049168 2.8085783 2.9374943
 2.4501796 2.6389687 2.698381  2.449328  2.6281908 2.9192243 2.5346
 3.2186363 3.0521655 2.7625234 2.6710515 1.8225272 2.5201855 2.8042684
 2.5603764 2.8365693 2.879499  2.2949328 2.7273467 2.4278646 2.513689
 2.3356495 2.855352  2.6161308 2.7293577 2.4486983 2.5926504 2.4628813
 2.8396814 2.5011744]
In [168]:
x_train_pred = xgrf.predict(X_train)
x_val_pred = xgrf.predict(X_val)
In [169]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,\
                 trendline="ols")

fig.show()
In [170]:
el.add_metrics(y_train,y_val,X_train,X_val,xgrf,model_desc)
In [171]:
el.get_metrics()
Out[171]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
45 DecisionTreegloveSynopsisgloveTitle 0.549 0.049 0.159 1.063 0.054 0.168 1.066
46 RandomForestgloveSynopsisgloveTitle 0.860 0.015 0.077 1.034 0.047 0.147 1.061
47 RandomForestTopFeaturesgloveSynopsisgloveTitle 0.814 0.020 0.093 1.040 0.047 0.148 1.062
48 SVMgloveSynopsisgloveTitle 0.627 0.041 0.143 1.057 0.047 0.156 1.061
49 XGBoostgloveSynopsisgloveTitle 0.852 0.016 0.092 1.036 0.046 0.154 1.061
In [172]:
el.save_data(PROJECT_PATH)
In [173]:
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,xgrf.predict(X_test)),\
                         name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
Top features
In [174]:
top_features = mb.plot_feature_importances(X_train,xgrf)
In [175]:
l = int(.50*len(top_features))
top_features = top_features.loc[:l,'feature']
In [176]:
model_desc = 'XGBoostTopFeatures' + encoding_strategy
xgrf_top = mb.build_model(xgrf,\
                        params,\
                        X_train[top_features],\
                        y_train,\
                        X_val[top_features],\
                        model_desc,\
                        PROJECT_PATH,\
                        scoring='neg_mean_squared_log_error',\
                        verbose=10)
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   23.5s finished
Best parameters for gs are {'colsample_bytree': 0.85, 'gamma': 0, 'learning_rate': 0.23, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 5, 'n_estimators': 100, 'objective': 'reg:squarederror', 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 1}
Here are the Top 100 train results 
[2.6514988 2.8438902 2.6867363 2.8507094 2.705443  2.400919  2.8872068
 2.936256  2.5364296 2.6058989 2.956048  2.469863  2.6991427 2.5530884
 2.7194958 2.1480694 2.9009638 2.3914495 2.437573  2.5546513 2.3984604
 2.674924  3.033125  2.7804325 2.3831348 3.0943305 2.9531145 2.5932329
 2.3549287 2.536842  2.6223943 2.692111  2.4756923 2.0908365 3.029957
 2.3283744 2.3093596 2.3403845 2.5099337 2.7346532 2.4382157 3.5357616
 2.6073842 2.5557396 2.9759748 2.7796845 2.4992309 2.938425  2.6738675
 2.8037724 2.7351599 2.1295447 2.4341192 2.0668592 3.237585  2.572492
 2.3248827 2.494007  2.5987682 2.8084655 2.3265152 3.102054  3.2952201
 2.6591024 2.3644085 2.7856724 2.0571673 2.556577  2.676617  3.0983095
 2.5846198 2.6811028 2.578989  2.4553435 2.7894528 2.452035  2.3578303
 2.7826724 2.5213344 2.4988623 2.239305  2.7286947 2.6436439 2.529862
 2.5246806 2.5351617 2.690113  2.9811244 2.4229698 2.7339134 2.8680966
 2.7643323 2.5388017 2.4747305 2.489394  2.59504   2.954595  2.5386426
 3.22407   2.5203524]
Here are the Top 100 val results 
[2.7092235 2.6920567 2.5865707 2.3815782 2.2994313 2.560005  2.8874667
 2.535402  2.7431948 2.5519743 2.6019325 2.6530182 2.390474  2.4504237
 2.7147179 2.3044672 2.676645  2.7027361 2.57787   2.8727577 2.9455047
 2.7555754 2.829356  2.4353333 2.84899   2.3836002 2.8261414 2.5745056
 2.365981  2.3802223 3.1259274 2.440857  2.6190107 2.604391  2.9715862
 2.653994  2.6837397 2.4542646 3.1230593 2.5888066 2.6607718 2.5432801
 2.8065906 2.8495603 2.8592315 2.2954392 2.5567486 2.7855453 2.8248322
 2.740486  2.956048  2.3962355 2.7166257 3.0583804 2.8342454 2.7376645
 2.7442303 2.0799456 2.9181697 2.703162  2.7487304 2.475165  2.9318771
 2.5634427 2.862989  2.6846325 2.387955  2.6095006 2.7627423 2.884802
 2.4294238 2.5874293 2.6681097 2.476428  2.571722  2.8525057 2.525364
 3.1000402 2.9899356 2.8075624 2.6985135 1.8568648 2.5203273 2.776468
 2.569928  2.816971  2.868755  2.3496509 2.7115288 2.4723043 2.4995909
 2.3587904 2.837208  2.505229  2.7379336 2.527037  2.6284034 2.340825
 2.8296463 2.5660875]
In [177]:
x_train_pred = xgrf_top.predict(X_train[top_features])
x_val_pred = xgrf_top.predict(X_val[top_features])
In [178]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,
                 trendline='ols')
fig.show()
In [179]:
el.add_metrics(y_train,\
               y_val,\
               X_train[top_features],\
               X_val[top_features],\
               xgrf_top,\
               model_desc)
In [180]:
el.get_metrics()
Out[180]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
45 DecisionTreegloveSynopsisgloveTitle 0.549 0.049 0.159 1.063 0.054 0.168 1.066
46 RandomForestgloveSynopsisgloveTitle 0.860 0.015 0.077 1.034 0.047 0.147 1.061
47 RandomForestTopFeaturesgloveSynopsisgloveTitle 0.814 0.020 0.093 1.040 0.047 0.148 1.062
48 SVMgloveSynopsisgloveTitle 0.627 0.041 0.143 1.057 0.047 0.156 1.061
49 XGBoostgloveSynopsisgloveTitle 0.852 0.016 0.092 1.036 0.046 0.154 1.061
50 XGBoostTopFeaturesgloveSynopsisgloveTitle 0.854 0.016 0.091 1.035 0.046 0.153 1.061
In [181]:
el.save_data(PROJECT_PATH)

Stacking

In [184]:
model_desc = 'Stacking' + encoding_strategy
In [185]:
estimators = [
              ('en',en),
              ('dt',dt),
              ('rf',rf),
              ('rf_top',rf_top),
              ('svm',svm),
              ('xgrf',xgrf),
              ('xgrf_top',xgrf_top)
              ]

stack = StackingRegressor(
        estimators=estimators,
        final_estimator = XGBRegressor(objective='reg:squarederror'),
        n_jobs=-1,
        verbose=100)

stack.fit(X_train,y_train) 
Out[185]:
StackingRegressor(cv=None,
                  estimators=[('en',
                               ElasticNet(alpha=0.0001, copy_X=True,
                                          fit_intercept=True, l1_ratio=1,
                                          max_iter=1000, normalize=False,
                                          positive=False, precompute=False,
                                          random_state=None, selection='cyclic',
                                          tol=0.0001, warm_start=False)),
                              ('dt',
                               DecisionTreeRegressor(ccp_alpha=0.0,
                                                     criterion='mse',
                                                     max_depth=5,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min...
                                               colsample_bynode=1,
                                               colsample_bytree=1, gamma=0,
                                               importance_type='gain',
                                               learning_rate=0.1,
                                               max_delta_step=0, max_depth=3,
                                               min_child_weight=1, missing=None,
                                               n_estimators=100, n_jobs=1,
                                               nthread=None,
                                               objective='reg:squarederror',
                                               random_state=0, reg_alpha=0,
                                               reg_lambda=1, scale_pos_weight=1,
                                               seed=None, silent=None,
                                               subsample=1, verbosity=1),
                  n_jobs=-1, passthrough=False, verbose=100)
In [186]:
#Save model
model_name = str(model_desc) + '.pkl'
pickle.dump(stack,open(os.path.join(PROJECT_PATH,\
                                      'model',model_name),'wb'))
In [187]:
x_train_pred = stack.predict(X_train)
x_val_pred = stack.predict(X_val)
In [188]:
fig = px.scatter(y=x_val_pred,\
                 x=y_val,
                 trendline='ols')
fig.show()
In [189]:
el.add_metrics(y_train,y_val,X_train,X_val,stack,model_desc)
In [190]:
el.get_metrics()
Out[190]:
model r_squared train_mse train_mae train_lmae val_mse val_mae val_lmae
0 XGBoostRF 0.224 0.086 0.220 1.084 0.088 0.220 1.086
1 Stacking 0.666 0.037 0.143 1.055 0.057 0.177 1.067
2 LinearReg 0.499 0.056 0.178 1.068 0.066 0.192 1.073
3 DecisionTree 0.369 0.070 0.198 1.077 0.086 0.221 1.084
4 RandomForest 0.693 0.034 0.136 1.053 0.064 0.186 1.071
5 RandomForestTopFeatures 0.701 0.033 0.134 1.052 0.063 0.186 1.071
6 SVM 0.164 0.093 0.223 1.088 0.107 0.243 1.093
7 XGBoost 0.676 0.036 0.143 1.054 0.060 0.181 1.069
8 XGBoostTopFeatures 0.693 0.034 0.140 1.053 0.059 0.180 1.069
9 RandomForestTopFeaturesglove 0.834 0.018 0.096 1.039 0.066 0.185 1.072
10 RandomForestglove 0.724 0.030 0.127 1.050 0.069 0.194 1.075
11 XGBoostTopFeaturesglove 0.774 0.025 0.118 1.045 0.067 0.194 1.074
12 LinearRegglove 0.902 0.011 0.063 1.029 0.013 0.068 1.033
13 DecisionTreeglove 0.880 0.013 0.067 1.033 0.015 0.071 1.035
14 SVMglove 0.946 0.006 0.062 1.022 0.014 0.080 1.034
15 XGBoostglove 0.880 0.013 0.065 1.033 0.014 0.067 1.034
16 LinearRegtfidf 0.647 0.039 0.145 1.056 299642795180295913472.000 400081986.400 1.831
17 ElasticNettfidf 0.638 0.040 0.146 1.056 0.045 0.155 1.060
18 DecisionTreetfidf 0.602 0.044 0.149 1.059 0.053 0.167 1.066
19 SVMtfidf 0.766 0.026 0.114 1.045 0.042 0.149 1.058
20 XGBoosttfidf 0.784 0.024 0.110 1.043 0.043 0.148 1.059
21 XGBoostTopFeaturestfidf 0.781 0.024 0.111 1.044 0.043 0.147 1.059
22 RandomForesttfidf 0.858 0.016 0.078 1.035 0.044 0.143 1.059
23 RandomForestTopFeaturestfidf 0.859 0.015 0.078 1.035 0.044 0.143 1.059
24 Stackingtfidf 0.813 0.020 0.098 1.040 0.041 0.143 1.057
25 LinearReggloveSynopsistfidfTitle 0.633 0.040 0.148 1.057 0.048 0.161 1.062
26 ElasticNetgloveSynopsistfidfTitle 0.621 0.041 0.149 1.057 0.046 0.157 1.061
27 DecisionTreegloveSynopsistfidfTitle 0.548 0.049 0.160 1.063 0.055 0.170 1.066
28 SVMgloveSynopsistfidfTitle 0.751 0.027 0.118 1.046 0.045 0.153 1.060
29 XGBoostgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.149 1.060
30 XGBoostTopFeaturesgloveSynopsistfidfTitle 0.832 0.018 0.098 1.038 0.045 0.151 1.060
31 RandomForestgloveSynopsistfidfTitle 0.863 0.015 0.076 1.034 0.046 0.147 1.061
32 RandomForestTopFeaturesgloveSynopsistfidfTitle 0.863 0.015 0.077 1.034 0.046 0.147 1.061
33 StackinggloveSynopsistfidfTitle 0.820 0.020 0.099 1.040 0.043 0.146 1.059
34 LinearRegtfidfSynopsistfidfTitle 0.649 0.039 0.145 1.055 6656443694119302144.000 59630466.965 1.752
35 ElasticNettfidfSynopsistfidfTitle 0.639 0.040 0.146 1.056 0.045 0.155 1.060
36 DecisionTreetfidfSynopsistfidfTitle 0.601 0.044 0.149 1.059 0.053 0.165 1.065
37 RandomForesttfidfSynopsistfidfTitle 0.855 0.016 0.078 1.035 0.044 0.143 1.059
38 RandomForestTopFeaturestfidfSynopsistfidfTitle 0.857 0.016 0.078 1.035 0.044 0.143 1.059
39 SVMtfidfSynopsistfidfTitle 0.767 0.025 0.114 1.045 0.042 0.149 1.058
40 XGBoosttfidfSynopsistfidfTitle 0.782 0.024 0.110 1.044 0.042 0.147 1.058
41 XGBoostTopFeaturestfidfSynopsistfidfTitle 0.803 0.022 0.106 1.041 0.044 0.150 1.060
42 StackingtfidfSynopsistfidfTitle 0.815 0.020 0.098 1.040 0.041 0.144 1.057
43 LinearReggloveSynopsisgloveTitle 0.599 0.044 0.154 1.059 0.050 0.163 1.063
44 ElasticNetgloveSynopsisgloveTitle 0.592 0.045 0.154 1.060 0.048 0.159 1.062
45 DecisionTreegloveSynopsisgloveTitle 0.549 0.049 0.159 1.063 0.054 0.168 1.066
46 RandomForestgloveSynopsisgloveTitle 0.860 0.015 0.077 1.034 0.047 0.147 1.061
47 RandomForestTopFeaturesgloveSynopsisgloveTitle 0.814 0.020 0.093 1.040 0.047 0.148 1.062
48 SVMgloveSynopsisgloveTitle 0.627 0.041 0.143 1.057 0.047 0.156 1.061
49 XGBoostgloveSynopsisgloveTitle 0.852 0.016 0.092 1.036 0.046 0.154 1.061
50 XGBoostTopFeaturesgloveSynopsisgloveTitle 0.854 0.016 0.091 1.035 0.046 0.153 1.061
51 StackinggloveSynopsisgloveTitle 0.803 0.022 0.102 1.041 0.045 0.151 1.060
In [191]:
el.save_data(PROJECT_PATH)
In [192]:
file_name = model_desc + '.xlsx'
x_test_pred = pd.Series(pow(10,stack.predict(X_test)),\
                         name ='Price')
x_test_pred.to_excel(os.path.join(PROJECT_PATH,file_name),index=False)
In [193]:
#Loading the file.
file_name = str(model_desc) + '.pkl'
stack = pickle.load(open(os.\
                      path.\
                      join(PROJECT_PATH,\
                           'model',\
                           file_name),'rb'))

Analyzing results

In [194]:
mod = stack
In [195]:
X_tr_temp = X_train.loc[X_tr.index,:]
y_tr_temp = y_train[X_tr_temp.index]
x_pred_temp = pd.Series(mod.predict(X_tr_temp),\
                       name='y_pred',\
                        index=X_tr_temp.index)

X_tr_merge_temp = pd.concat([X_tr_temp,y_tr_temp,x_pred_temp],\
                            axis=1)
X_tr_merge_temp['error'] = pow(10,\
                               np.\
                               sqrt(np.\
                                    square(np.\
                                           log10(1+X_tr_merge_temp['LogPrice']) - np.\
                                           log10(1+X_tr_merge_temp['y_pred']))))
X_tr_merge_temp.sort_values('error',ascending=False,inplace=True)
In [198]:
top_features = mb.plot_feature_importances(X_train,rf)
n_features = top_features.loc[:15,'feature']
l = pd.Series(['LogPrice','y_pred','error'])
n_features = n_features.append(l)
X_tr_merge_temp[n_features].head()
Out[198]:
Author_LogPrice_mean Genre_LogPrice_mean Author_LogPrice_amin Author_LogPrice_amax rating_num Author_LogPrice_std synopsis_len review_num Genre_LogPrice_amin Genre_LogPrice_std Title2 Synopis95 Title0 Synopis59 Synopis70 Title8 LogPrice y_pred error
3897 0.546 0.430 0.443 0.835 0.130 0.690 0.022 0.925 0.165 0.278 0.104 0.275 0.087 0.277 0.176 0.104 3.748 2.819 1.243
2442 0.593 0.430 0.472 0.684 0.665 0.364 0.099 0.875 0.165 0.278 0.238 0.319 0.106 0.587 0.358 0.454 3.381 2.666 1.195
2441 0.593 0.430 0.472 0.684 0.653 0.364 0.099 0.875 0.165 0.278 0.238 0.319 0.106 0.587 0.358 0.454 3.381 2.666 1.195
6149 0.527 0.569 0.196 0.815 0.000 0.000 0.073 1.000 0.474 0.363 0.189 0.275 0.218 0.241 0.191 0.457 2.456 3.018 1.162
5064 0.554 0.460 0.261 0.883 0.117 0.000 0.091 1.000 0.051 0.310 0.284 0.450 0.690 0.288 0.276 0.086 2.210 2.698 1.152
In [199]:
train_data.loc[2274,:]
Out[199]:
index                                                                                                                        2702
Title                                                             The ILLUSION OF LIFE: DISNEY ANIMATION (Disney Editions Deluxe)
Author                                                                                                               Frank Thomas
Edition                                                                                                    Hardcover,– 5 Oct 1995
Reviews                                                                                                        5.0 out of 5 stars
Ratings                                                                                                        6 customer reviews
Synopsis                      An out-of-print collector's item since 1986, the definitive account of the development of Disney...
Genre                                                                                  Children's Mysteries & Curiosities (Books)
BookCategory                                                                                             Arts, Film & Photography
Price                                                                                                                    3439.000
LogPrice                                                                                                                    3.536
LogPriceBucket                                                                                                          Very High
title_len                                                                                                                   9.000
Author_clean                                                                                                          frankthomas
top_Author                                                                                                           other_Author
Author_Title_count                                                                                                          0.000
Author_LogPrice_amin                                                                                                        1.398
Author_LogPrice_mean                                                                                                        2.773
Author_LogPrice_amax                                                                                                        4.069
Author_LogPrice_std                                                                                                         0.000
bind                                                                                                                    Hardcover
sour                                                                                                                   Other_sour
month                                                                                                                      10.000
year                                                                                                                     1995.000
top_bind                                                                                                                Hardcover
bind_Title_count                                                                                                          823.000
bind_LogPrice_amin                                                                                                          1.708
bind_LogPrice_mean                                                                                                          2.813
bind_LogPrice_amax                                                                                                          4.122
bind_LogPrice_std                                                                                                           0.386
top_sour                                                                                                               Other_sour
sour_Title_count                                                                                                         5451.000
sour_LogPrice_amin                                                                                                          1.398
sour_LogPrice_mean                                                                                                          2.591
sour_LogPrice_amax                                                                                                          4.149
sour_LogPrice_std                                                                                                           0.326
review_num                                                                                                                  5.000
rating_num                                                                                                                  1.204
top_Genre                                                                                                             other_Genre
Genre_Title_count                                                                                                           5.000
Genre_LogPrice_amin                                                                                                         2.210
Genre_LogPrice_mean                                                                                                         2.805
Genre_LogPrice_amax                                                                                                         3.536
Genre_LogPrice_std                                                                                                          0.476
BookCategory_Title_count                                                                                                  517.000
BookCategory_LogPrice_amin                                                                                                  1.398
BookCategory_LogPrice_mean                                                                                                  2.773
BookCategory_LogPrice_amax                                                                                                  4.069
BookCategory_LogPrice_std                                                                                                   0.358
synopsis_len                                                                                                               37.000
year_month                                                                                                             199510.000
title_seq                                                                                                                   1.000
title_seq_max                                                                                                               1.500
review_num_lag1                                                                                                             0.000
review_num_lag2                                                                                                             0.000
review_num_lag3                                                                                                             0.000
rating_num_lag1                                                                                                             0.000
rating_num_lag2                                                                                                             0.000
rating_num_lag3                                                                                                             0.000
Name: 2274, dtype: object
In [200]:
X_train.loc[6407,:]
Out[200]:
title_len              0.161
Author_Title_count     0.000
Author_LogPrice_amin   0.135
Author_LogPrice_mean   0.441
Author_LogPrice_amax   0.835
                        ... 
Synopis95              0.308
Synopis96              0.169
Synopis97              0.249
Synopis98              0.175
Synopis99              0.289
Name: 6407, Length: 254, dtype: float64
In [201]:
fig = px.scatter(data_frame=X_tr_merge_temp,
                 y='Author_LogPrice_mean',\
                 x='error',\
                 hover_data=[X_tr_merge_temp.index])

fig.show()
In [202]:
X_train_title.loc[6407,:].sort_values(ascending=False)
Out[202]:
Title1   0.656
Title0   0.543
Title2   0.343
Title3   0.212
Title4   0.188
Title5   0.100
Title7   0.074
Title8   0.072
Title6   0.049
Name: 6407, dtype: float32
In [203]:
X_train_synopsis.loc[6407,:].sort_values(ascending=False)
Out[203]:
Synopis51   0.722
Synopis28   0.623
Synopis52   0.607
Synopis68   0.607
Synopis66   0.583
             ... 
Synopis64   0.157
Synopis9    0.147
Synopis87   0.144
Synopis10   0.132
Synopis61   0.131
Name: 6407, Length: 100, dtype: float32

HTML

In [205]:
!jupyter nbconvert --to html 'MachineHack_PredictPriceBookNC.ipynb'
[NbConvertApp] Converting notebook MachineHack_PredictPriceBookNC.ipynb to html
[NbConvertApp] Writing 4027406 bytes to MachineHack_PredictPriceBookNC.html